Skip to content

Commit 51440d9

Browse files
cnpalmer authored and zane131 committed
PRD: NVDIMM callouts for register access errors
Change-Id: I2cef2d19db633d9cfb98f72cfd865ecdf960a1fe
CQ: SW487048
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/92994
Reviewed-by: Brian J Stegmiller <bjs@us.ibm.com>
Reviewed-by: Benjamen G Tyner <ben.tyner@ibm.com>
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Reviewed-by: Zane C Shelley <zshelle@us.ibm.com>
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/93481
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
1 parent 820a099 commit 51440d9

File tree

2 files changed

+84
-38
lines changed

2 files changed

+84
-38
lines changed

src/usr/diag/prdf/common/plat/mem/prdfMemExtraSig.H

Lines changed: 14 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@
55
/* */
66
/* OpenPOWER HostBoot Project */
77
/* */
8-
/* Contributors Listed Below - COPYRIGHT 2016,2019 */
8+
/* Contributors Listed Below - COPYRIGHT 2016,2020 */
99
/* [+] International Business Machines Corp. */
1010
/* */
1111
/* */
@@ -102,18 +102,19 @@ PRDR_ERROR_SIGNATURE(EsLifeErr, 0xffff0090, "", "NVDIMM Final Energy Source
102102
PRDR_ERROR_SIGNATURE(EsTmpErrHigh, 0xffff0091, "", "NVDIMM Energy Source Temperature Error - High Temp Threshold");
103103
PRDR_ERROR_SIGNATURE(EsTmpErrLow, 0xffff0092, "", "NVDIMM Energy Source Temperature Error - Low Temp Threshold");
104104

105-
PRDR_ERROR_SIGNATURE(NvmLifeWarn1, 0xffff0093, "", "NVDIMM First NVM Lifetime Warning");
106-
PRDR_ERROR_SIGNATURE(NvmLifeWarn2, 0xffff0094, "", "NVDIMM Second NVM Lifetime Warning");
107-
PRDR_ERROR_SIGNATURE(EsLifeWarn1, 0xffff0095, "", "NVDIMM First Energy Source Lifetime Warning");
108-
PRDR_ERROR_SIGNATURE(EsLifeWarn2, 0xffff0096, "", "NVDIMM Second Energy Source Lifetime Warning");
109-
PRDR_ERROR_SIGNATURE(EsTmpWarnHigh, 0xffff0097, "", "NVDIMM Energy Source Temperature Warning - High Temp Threshold");
110-
PRDR_ERROR_SIGNATURE(EsTmpWarnLow, 0xffff0098, "", "NVDIMM Energy Source Temperature Warning - Low Temp Threshold");
111-
PRDR_ERROR_SIGNATURE(BelowWarnTh, 0xffff0099, "", "NVDIMM Below Warning Threshold");
112-
PRDR_ERROR_SIGNATURE(IntNvdimmErr, 0xffff009A, "", "NVDIMM Intermittent error");
113-
PRDR_ERROR_SIGNATURE(NotifStatErr, 0xffff009B, "", "NVDIMM Set Event Notification Status Error");
114-
PRDR_ERROR_SIGNATURE(FirEvntGone, 0xffff009C, "", "NVDIMM Event Triggering the FIR no longer present");
115-
PRDR_ERROR_SIGNATURE(EsTmpWarnFa, 0xffff009D, "", "NVDIMM Energy Source Temperature Warning - False Alarm");
116-
PRDR_ERROR_SIGNATURE(EsTmpErrFa, 0xffff009E, "", "NVDIMM Energy Source Temperature Error - False Alarm");
105+
PRDR_ERROR_SIGNATURE(NvmLifeWarn1, 0xffff0093, "", "NVDIMM First NVM Lifetime Warning");
106+
PRDR_ERROR_SIGNATURE(NvmLifeWarn2, 0xffff0094, "", "NVDIMM Second NVM Lifetime Warning");
107+
PRDR_ERROR_SIGNATURE(EsLifeWarn1, 0xffff0095, "", "NVDIMM First Energy Source Lifetime Warning");
108+
PRDR_ERROR_SIGNATURE(EsLifeWarn2, 0xffff0096, "", "NVDIMM Second Energy Source Lifetime Warning");
109+
PRDR_ERROR_SIGNATURE(EsTmpWarnHigh, 0xffff0097, "", "NVDIMM Energy Source Temperature Warning - High Temp Threshold");
110+
PRDR_ERROR_SIGNATURE(EsTmpWarnLow, 0xffff0098, "", "NVDIMM Energy Source Temperature Warning - Low Temp Threshold");
111+
PRDR_ERROR_SIGNATURE(BelowWarnTh, 0xffff0099, "", "NVDIMM Below Warning Threshold");
112+
PRDR_ERROR_SIGNATURE(IntNvdimmErr, 0xffff009A, "", "NVDIMM Intermittent error");
113+
PRDR_ERROR_SIGNATURE(NotifStatErr, 0xffff009B, "", "NVDIMM Set Event Notification Status Error");
114+
PRDR_ERROR_SIGNATURE(FirEvntGone, 0xffff009C, "", "NVDIMM Event Triggering the FIR no longer present");
115+
PRDR_ERROR_SIGNATURE(EsTmpWarnFa, 0xffff009D, "", "NVDIMM Energy Source Temperature Warning - False Alarm");
116+
PRDR_ERROR_SIGNATURE(EsTmpErrFa, 0xffff009E, "", "NVDIMM Energy Source Temperature Error - False Alarm");
117+
PRDR_ERROR_SIGNATURE(NvdimmReadFail, 0xffff009F, "", "NVDIMM Failure to read NVDIMM register");
117118

118119
#endif // __prdfMemExtraSig_H
119120

src/usr/diag/prdf/plat/mem/prdfP9Mca.C

Lines changed: 70 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
/* */
66
/* OpenPOWER HostBoot Project */
77
/* */
8-
/* Contributors Listed Below - COPYRIGHT 2016,2019 */
8+
/* Contributors Listed Below - COPYRIGHT 2016,2020 */
99
/* [+] International Business Machines Corp. */
1010
/* */
1111
/* */
@@ -442,7 +442,7 @@ uint32_t __analyzeHealthStatus0Reg(STEP_CODE_DATA_STRUCT & io_sc,
442442
PRDF_ERR( PRDF_FUNC "Failed to read Health Status0 Register. "
443443
"HUID: 0x%08x", getHuid(i_dimm) );
444444
PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT );
445-
o_rc = FAIL;
445+
o_rc = PRD_SCANCOM_FAILURE;
446446
break;
447447
}
448448
std::map<uint8_t,bool> bitList = __nvdimmGetActiveBits( data );
@@ -551,7 +551,7 @@ uint32_t __analyzeHealthStatus1Reg( STEP_CODE_DATA_STRUCT & io_sc,
551551
PRDF_ERR( PRDF_FUNC "Failed to read Health Status1 Register. "
552552
"HUID: 0x%08x", getHuid(i_dimm) );
553553
PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT );
554-
o_rc = FAIL;
554+
o_rc = PRD_SCANCOM_FAILURE;
555555
break;
556556
}
557557
std::map<uint8_t,bool> bitList = __nvdimmGetActiveBits( data );
@@ -710,7 +710,7 @@ uint32_t __readTemp( TargetHandle_t i_dimm, uint16_t i_tempMsbReg,
710710
PRDF_ERR( PRDF_FUNC "Failed to read ES Temperature MSB Register. "
711711
"HUID: 0x%08x", getHuid(i_dimm) );
712712
PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT );
713-
o_rc = FAIL;
713+
o_rc = PRD_SCANCOM_FAILURE;
714714
break;
715715
}
716716

@@ -721,7 +721,7 @@ uint32_t __readTemp( TargetHandle_t i_dimm, uint16_t i_tempMsbReg,
721721
PRDF_ERR( PRDF_FUNC "Failed to read ES Temperature LSB Register. "
722722
"HUID: 0x%08x", getHuid(i_dimm) );
723723
PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT );
724-
o_rc = FAIL;
724+
o_rc = PRD_SCANCOM_FAILURE;
725725
break;
726726
}
727727

@@ -768,7 +768,7 @@ uint32_t __analyzeErrorThrStatusReg( STEP_CODE_DATA_STRUCT & io_sc,
768768
PRDF_ERR( PRDF_FUNC "Failed to read Error Threshold Status Reg. "
769769
"HUID: 0x%08x", getHuid(i_dimm) );
770770
PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT );
771-
o_rc = FAIL;
771+
o_rc = PRD_SCANCOM_FAILURE;
772772
break;
773773
}
774774
std::map<uint8_t,bool> bitList = __nvdimmGetActiveBits( data );
@@ -899,7 +899,7 @@ uint32_t __adjustThreshold( STEP_CODE_DATA_STRUCT & io_sc,
899899
PRDF_ERR( PRDF_FUNC "Failed to read Warning Threshold Reg. HUID: "
900900
"0x%08x", getHuid(i_dimm) );
901901
PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT );
902-
o_rc = FAIL;
902+
o_rc = PRD_SCANCOM_FAILURE;
903903
break;
904904
}
905905

@@ -912,7 +912,7 @@ uint32_t __adjustThreshold( STEP_CODE_DATA_STRUCT & io_sc,
912912
PRDF_ERR( PRDF_FUNC "Failed to read Error Threshold Reg. HUID: "
913913
"0x%08x", getHuid(i_dimm) );
914914
PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT );
915-
o_rc = FAIL;
915+
o_rc = PRD_SCANCOM_FAILURE;
916916
break;
917917
}
918918

@@ -941,7 +941,7 @@ uint32_t __adjustThreshold( STEP_CODE_DATA_STRUCT & io_sc,
941941
PRDF_ERR( PRDF_FUNC "Failed to clear Set Event Notification "
942942
"Cmd Reg. HUID: 0x%08x", getHuid(i_dimm) );
943943
PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT );
944-
o_rc = FAIL;
944+
o_rc = PRD_SCANCOM_FAILURE;
945945
break;
946946
}
947947

@@ -955,7 +955,7 @@ uint32_t __adjustThreshold( STEP_CODE_DATA_STRUCT & io_sc,
955955
PRDF_ERR( PRDF_FUNC "Failed to read Set Event Notification "
956956
"Status Reg. HUID: 0x%08x", getHuid(i_dimm) );
957957
PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT );
958-
o_rc = FAIL;
958+
o_rc = PRD_SCANCOM_FAILURE;
959959
break;
960960
}
961961
std::map<uint8_t,bool> bitList = __nvdimmGetActiveBits( notifStat );
@@ -991,7 +991,7 @@ uint32_t __adjustThreshold( STEP_CODE_DATA_STRUCT & io_sc,
991991
PRDF_ERR( PRDF_FUNC "Failed to write Warning Threshold Reg. "
992992
"HUID: 0x%08x", getHuid(i_dimm) );
993993
PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT );
994-
o_rc = FAIL;
994+
o_rc = PRD_SCANCOM_FAILURE;
995995
break;
996996
}
997997

@@ -1004,7 +1004,7 @@ uint32_t __adjustThreshold( STEP_CODE_DATA_STRUCT & io_sc,
10041004
PRDF_ERR( PRDF_FUNC "Failed to write Set Event Notification "
10051005
"Cmd Reg. HUID: 0x%08x", getHuid(i_dimm) );
10061006
PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT );
1007-
o_rc = FAIL;
1007+
o_rc = PRD_SCANCOM_FAILURE;
10081008
break;
10091009
}
10101010

@@ -1017,7 +1017,7 @@ uint32_t __adjustThreshold( STEP_CODE_DATA_STRUCT & io_sc,
10171017
PRDF_ERR( PRDF_FUNC "Failed to read Set Event Notification "
10181018
"Status Reg. HUID: 0x%08x", getHuid(i_dimm) );
10191019
PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT );
1020-
o_rc = FAIL;
1020+
o_rc = PRD_SCANCOM_FAILURE;
10211021
break;
10221022
}
10231023
bitList = __nvdimmGetActiveBits( notifStat );
@@ -1084,7 +1084,7 @@ uint32_t __analyzeWarningThrStatusReg(STEP_CODE_DATA_STRUCT & io_sc,
10841084
PRDF_ERR( PRDF_FUNC "Failed to read Warning Threshold Status Reg. "
10851085
"HUID: 0x%08x", getHuid(i_dimm) );
10861086
PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT );
1087-
o_rc = FAIL;
1087+
o_rc = PRD_SCANCOM_FAILURE;
10881088
break;
10891089
}
10901090
std::map<uint8_t,bool> bitList = __nvdimmGetActiveBits( data );
@@ -1298,7 +1298,7 @@ uint32_t __deassertEventN( TargetHandle_t i_dimm )
12981298
PRDF_ERR( PRDF_FUNC "Failed to read NVDIMM_MGT_CMD1. "
12991299
"HUID: 0x%08x", getHuid(i_dimm) );
13001300
PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT );
1301-
o_rc = FAIL;
1301+
o_rc = PRD_SCANCOM_FAILURE;
13021302
break;
13031303
}
13041304

@@ -1313,7 +1313,7 @@ uint32_t __deassertEventN( TargetHandle_t i_dimm )
13131313
PRDF_ERR( PRDF_FUNC "Failed to write NVDIMM_MGT_CMD1. "
13141314
"HUID: 0x%08x", getHuid(i_dimm) );
13151315
PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT );
1316-
o_rc = FAIL;
1316+
o_rc = PRD_SCANCOM_FAILURE;
13171317
break;
13181318
}
13191319

@@ -1378,6 +1378,20 @@ int32_t AnalyzeNvdimmHealthStatRegs( ExtensibleChip * i_chip,
13781378
PRDF_ERR( PRDF_FUNC "Failed to read Module Health Register. "
13791379
"HUID: 0x%08x", getHuid(dimm) );
13801380
PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT );
1381+
1382+
// If we got a failure reading one of the NVDIMM registers,
1383+
// add a signature and make the log predictive.
1384+
__addSignature( io_sc, i_chip->getTrgt(), errFound,
1385+
PRDFSIG_NvdimmReadFail );
1386+
errFound = true;
1387+
io_sc.service_data->SetThresholdMaskId(0);
1388+
1389+
// Callout NVDIMM
1390+
io_sc.service_data->SetCallout( dimm, MRU_MED, NO_GARD );
1391+
1392+
// Send message to PHYP that save/restore may work
1393+
l_rc = PlatServices::nvdimmNotifyProtChange( dimm,
1394+
NVDIMM::NVDIMM_RISKY_HW_ERROR );
13811395
continue;
13821396
}
13831397
std::map<uint8_t,bool> bitList = __nvdimmGetActiveBits( data );
@@ -1387,17 +1401,33 @@ int32_t AnalyzeNvdimmHealthStatRegs( ExtensibleChip * i_chip,
13871401
{
13881402
// Analyze Health Status0 Reg, Health Status1 Reg,
13891403
// and Error Theshold Status Reg
1390-
l_rc = __analyzeHealthStatus0Reg( io_sc, dimm, errFound );
1391-
if ( SUCCESS != l_rc ) continue;
1392-
l_rc = __analyzeHealthStatus1Reg( io_sc, dimm, errFound );
1393-
if ( SUCCESS != l_rc ) continue;
1394-
bool esTempErr = false;
1395-
l_rc = __analyzeErrorThrStatusReg(io_sc, dimm, errFound, esTempErr);
1396-
if ( SUCCESS != l_rc ) continue;
1404+
bool esTmpErr = false;
1405+
uint32_t l_rcStat0 = SUCCESS;
1406+
uint32_t l_rcStat1 = SUCCESS;
1407+
uint32_t l_rcErrTh = SUCCESS;
1408+
l_rcStat0 = __analyzeHealthStatus0Reg( io_sc, dimm, errFound );
1409+
l_rcStat1 = __analyzeHealthStatus1Reg( io_sc, dimm, errFound );
1410+
l_rcErrTh = __analyzeErrorThrStatusReg( io_sc, dimm, errFound,
1411+
esTmpErr );
1412+
1413+
if ( PRD_SCANCOM_FAILURE == l_rcStat0 ||
1414+
PRD_SCANCOM_FAILURE == l_rcStat1 ||
1415+
PRD_SCANCOM_FAILURE == l_rcErrTh )
1416+
{
1417+
// If we got a failure reading one of the NVDIMM registers,
1418+
// add a signature and make the log predictive.
1419+
__addSignature( io_sc, i_chip->getTrgt(), errFound,
1420+
PRDFSIG_NvdimmReadFail );
1421+
errFound = true;
1422+
io_sc.service_data->SetThresholdMaskId(0);
1423+
1424+
// Callout NVDIMM
1425+
io_sc.service_data->SetCallout( dimm, MRU_MED, NO_GARD );
1426+
}
13971427

13981428
// If we hit an ES temperature error and have not yet hit threshold,
13991429
// then keep the log hidden.
1400-
if ( esTempErr && !io_sc.service_data->IsAtThreshold() ) continue;
1430+
if ( esTmpErr && !io_sc.service_data->IsAtThreshold() ) continue;
14011431

14021432
// If we didn't find any error, then keep the log hidden.
14031433
if ( !errFound )
@@ -1426,7 +1456,22 @@ int32_t AnalyzeNvdimmHealthStatRegs( ExtensibleChip * i_chip,
14261456
else if ( bitList.count(1) )
14271457
{
14281458
l_rc = __analyzeWarningThrStatusReg( io_sc, dimm, errFound );
1429-
if ( SUCCESS != l_rc ) continue;
1459+
if ( PRD_SCANCOM_FAILURE == l_rc )
1460+
{
1461+
// If we got a failure reading one of the NVDIMM registers,
1462+
// add a signature and make the log predictive.
1463+
__addSignature( io_sc, i_chip->getTrgt(), errFound,
1464+
PRDFSIG_NvdimmReadFail );
1465+
errFound = true;
1466+
io_sc.service_data->SetThresholdMaskId(0);
1467+
1468+
// Callout NVDIMM
1469+
io_sc.service_data->SetCallout( dimm, MRU_MED, NO_GARD );
1470+
1471+
// Send message to PHYP that save/restore may work
1472+
l_rc = PlatServices::nvdimmNotifyProtChange( dimm,
1473+
NVDIMM::NVDIMM_RISKY_HW_ERROR );
1474+
}
14301475

14311476
if ( !errFound )
14321477
{

0 commit comments

Comments
 (0)