Skip to content

Commit

Permalink
PRD: Avoid gard for NVDIMMs
Browse files Browse the repository at this point in the history
Change-Id: Icaa517b196826c2b442da769ef45b3cdf56e6a9d
CQ: SW467502
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/79189
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Reviewed-by: Brian J. Stegmiller <bjs@us.ibm.com>
Reviewed-by: Benjamen G. Tyner <ben.tyner@ibm.com>
Reviewed-by: Paul Greenwood <paul.greenwood@ibm.com>
Reviewed-by: Zane C. Shelley <zshelle@us.ibm.com>
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/79670
  • Loading branch information
cnpalmer authored and zane131 committed Jun 28, 2019
1 parent 1581c67 commit 8db1ba5
Show file tree
Hide file tree
Showing 14 changed files with 202 additions and 22 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
/* */
/* OpenPOWER HostBoot Project */
/* */
/* Contributors Listed Below - COPYRIGHT 2012,2018 */
/* Contributors Listed Below - COPYRIGHT 2012,2019 */
/* [+] International Business Machines Corp. */
/* */
/* */
Expand Down Expand Up @@ -627,6 +627,11 @@ class ServiceDataCollector
*/
void clearMruListGard();

/**
* @brief Iterates the MRU list and clears gard for any NVDIMM targets.
*/
void clearNvdimmMruListGard();

/**
* @brief Iterates the MRU list and returns true if at least one target in
* the list is set to be garded.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
/* */
/* OpenPOWER HostBoot Project */
/* */
/* Contributors Listed Below - COPYRIGHT 2012,2015 */
/* Contributors Listed Below - COPYRIGHT 2012,2019 */
/* [+] International Business Machines Corp. */
/* */
/* */
Expand Down Expand Up @@ -177,6 +177,50 @@ void ServiceDataCollector::clearMruListGard()

//------------------------------------------------------------------------------

void ServiceDataCollector::clearNvdimmMruListGard()
{
#define PRDF_FUNC "[ServiceDataCollector::clearNvdimmMruListGard] "

// Loop through the MRU list.
for ( auto & mru : xMruList )
{
PRDcallout callout = mru.callout;
TargetHandle_t trgt = callout.getTarget();
if ( TYPE_DIMM == PlatServices::getTargetType(trgt) )
{
// If the callout target is an NVDIMM, do not gard it and send a
// message to PHYP/Hostboot that a save/restore may work.
if ( isNVDIMM(trgt) )
{
mru.gardState = NO_GARD;

#ifdef __HOSTBOOT_MODULE

#ifdef __HOSTBOOT_RUNTIME
// Hostboot runtime, send the message to PHYP
uint32_t l_rc = PlatServices::nvdimmNotifyPhypProtChange( trgt,
NVDIMM::NVDIMM_RISKY_HW_ERROR );
if ( SUCCESS != l_rc )
{
PRDF_TRAC( PRDF_FUNC "nvdimmNotifyPhypProtChange(0x%08x) "
"failed.", PlatServices::getHuid(trgt) );
continue;
}
#else
// IPL, set the appropriate internal attribute in Hostboot
trgt->setAttr<ATTR_NV_STATUS_FLAG>(0x40);
#endif

#endif // __HOSTBOOT_MODULE
}
}
}

#undef PRDF_FUNC
}

//------------------------------------------------------------------------------

bool ServiceDataCollector::isGardRequested()
{
bool gardRecordExit = false;
Expand Down
3 changes: 3 additions & 0 deletions src/usr/diag/prdf/common/plat/mem/prdfMemMark.C
Original file line number Diff line number Diff line change
Expand Up @@ -1390,6 +1390,9 @@ uint32_t applyRasPolicies( ExtensibleChip * i_chip, const MemRank & i_rank,
{
io_sc.service_data->setServiceCall();

// We want to try to avoid garding NVDIMMs, so clear gard for them now.
io_sc.service_data->clearNvdimmMruListGard();

#ifdef __HOSTBOOT_RUNTIME
// No more repairs left so no point doing any more TPS procedures.
MemDbUtils::banTps<T>( i_chip, i_rank );
Expand Down
14 changes: 7 additions & 7 deletions src/usr/diag/prdf/common/plat/nimbus/nimbus_mca.rule
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,7 @@ group gMCACALFIR
/** MCACALFIR[0]
* A MBA recoverable error has occurred.
*/
(rMCACALFIR, bit(0)) ? self_th_1;
(rMCACALFIR, bit(0)) ? nvdimm_self_th_1;

/** MCACALFIR[1]
* MBA Nonrecoverable Error
Expand All @@ -251,7 +251,7 @@ group gMCACALFIR
/** MCACALFIR[2]
* Excessive refreshes to a single rank.
*/
(rMCACALFIR, bit(2)) ? self_th_32perDay;
(rMCACALFIR, bit(2)) ? nvdimm_self_th_32perDay;

/** MCACALFIR[3]
* Err detected in the MBA debug WAT logic
Expand All @@ -266,7 +266,7 @@ group gMCACALFIR
/** MCACALFIR[5]
* Calibration complete indication xout
*/
(rMCACALFIR, bit(5)) ? self_th_32perDay;
(rMCACALFIR, bit(5)) ? nvdimm_self_th_32perDay;

/** MCACALFIR[6]
* Emergency Throttle
Expand Down Expand Up @@ -533,7 +533,7 @@ group gMCAECCFIR
/** MCAECCFIR[42]
* SCOM_PARITY_CLASS_RECOVERABLE
*/
(rMCAECCFIR, bit(42)) ? self_th_1;
(rMCAECCFIR, bit(42)) ? nvdimm_self_th_1;

/** MCAECCFIR[43]
* SCOM_PARITY_CLASS_UNRECOVERABLE
Expand All @@ -548,7 +548,7 @@ group gMCAECCFIR
/** MCAECCFIR[45]
* WRITE_RMW_CE
*/
(rMCAECCFIR, bit(45)) ? self_th_32perDay;
(rMCAECCFIR, bit(45)) ? nvdimm_self_th_32perDay;

/** MCAECCFIR[46]
* WRITE_RMW_UE
Expand Down Expand Up @@ -686,12 +686,12 @@ group gDDRPHYFIR
/** DDRPHYFIR[60]
* Register PE 4 bit impact
*/
(rDDRPHYFIR, bit(60)) ? self_th_1;
(rDDRPHYFIR, bit(60)) ? nvdimm_self_th_1;

/** DDRPHYFIR[61]
* Register PE 1 bit impact
*/
(rDDRPHYFIR, bit(61)) ? self_th_1;
(rDDRPHYFIR, bit(61)) ? nvdimm_self_th_1;

};

Expand Down
22 changes: 20 additions & 2 deletions src/usr/diag/prdf/common/plat/nimbus/nimbus_mca_actions.rule
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ actionclass rcd_parity_error
calloutSelfLowNoGard; # Self LOW
# Thresholding done in plugin
funccall("RcdParityError"); # Run TPS on TH for all MCA ranks
funccall("ClearNvdimmGardState"); # Clear gard for NVDIMMs
};

/** Handle Mainline IUEs */
Expand Down Expand Up @@ -125,19 +126,36 @@ actionclass maintenance_iaue_handling
/** MCA/UE algorithm, threshold 5 per day */
actionclass mca_ue_algorithm_th_5perDay
{
# NOTE(review): this view shows both a bare calloutSelfMed and the try(...)
# form below; the bare line looks like the pre-change version that the
# try(...) line replaces -- confirm only one is meant to remain.
calloutSelfMed;
# CheckForNvdimms is documented to call out self with NO_GARD and return
# SUCCESS when an NVDIMM is connected; calloutSelfMed is the alternative
# handed to try() (presumably used when the plugin does not return SUCCESS).
try( funccall("CheckForNvdimms"), calloutSelfMed );
threshold5pday;
funccall("mcaUeAlgorithm"); # must be called last
};

/** MCA/UE algorithm, threshold 1 */
actionclass mca_ue_algorithm_th_1
{
# NOTE(review): this view shows both a bare calloutSelfMed and the try(...)
# form below; the bare line looks like the pre-change version that the
# try(...) line replaces -- confirm only one is meant to remain.
calloutSelfMed;
# CheckForNvdimms is documented to call out self with NO_GARD and return
# SUCCESS when an NVDIMM is connected; calloutSelfMed is the alternative
# handed to try() (presumably used when the plugin does not return SUCCESS).
try( funccall("CheckForNvdimms"), calloutSelfMed );
threshold1;
funccall("mcaUeAlgorithm"); # must be called last
};

################################################################################
# NVDIMM callouts #
################################################################################

# Simple callouts that will avoid gard for NVDIMMs
# Self callout that avoids gard when an NVDIMM is connected.
actionclass nvdimm_self_th_1
{
# CheckForNvdimms is documented to call out self with NO_GARD and return
# SUCCESS when an NVDIMM is connected; calloutSelfMed is the alternative
# handed to try() (presumably used when the plugin does not return SUCCESS).
try( funccall("CheckForNvdimms"), calloutSelfMed );
# Presumably a threshold of 1, going by the action name -- confirm.
threshold1;
};

# Self callout that avoids gard when an NVDIMM is connected.
actionclass nvdimm_self_th_32perDay
{
# CheckForNvdimms is documented to call out self with NO_GARD and return
# SUCCESS when an NVDIMM is connected; calloutSelfMed is the alternative
# handed to try() (presumably used when the plugin does not return SUCCESS).
try( funccall("CheckForNvdimms"), calloutSelfMed );
# Presumably a threshold of 32 per day, going by the action name -- confirm.
threshold32pday;
};

################################################################################
# Analyze groups
################################################################################
Expand Down
4 changes: 2 additions & 2 deletions src/usr/diag/prdf/common/plat/nimbus/nimbus_mcbist.rule
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#
# OpenPOWER HostBoot Project
#
# Contributors Listed Below - COPYRIGHT 2016,2018
# Contributors Listed Below - COPYRIGHT 2016,2019
# [+] International Business Machines Corp.
#
#
Expand Down Expand Up @@ -599,7 +599,7 @@ group gMCBISTFIR
/** MCBISTFIR[13]
* SCOM_RECOVERABLE_REG_PE
*/
(rMCBISTFIR, bit(13)) ? self_th_1;
(rMCBISTFIR, bit(13)) ? nvdimm_self_th_1;

/** MCBISTFIR[14]
* SCOM_FATAL_REG_PE
Expand Down
13 changes: 12 additions & 1 deletion src/usr/diag/prdf/common/plat/nimbus/nimbus_mcbist_actions.rule
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#
# OpenPOWER HostBoot Project
#
# Contributors Listed Below - COPYRIGHT 2016,2018
# Contributors Listed Below - COPYRIGHT 2016,2019
# [+] International Business Machines Corp.
#
#
Expand Down Expand Up @@ -36,6 +36,17 @@ actionclass command_addr_timeout
funccall("commandAddrTimeout");
};

################################################################################
# NVDIMM callouts #
################################################################################

# Simple callouts that will avoid gard for NVDIMMs
# Self callout that avoids gard when an NVDIMM is connected.
actionclass nvdimm_self_th_1
{
# CheckForNvdimms is documented to call out self with NO_GARD and return
# SUCCESS when an NVDIMM is connected; calloutSelfMed is the alternative
# handed to try() (presumably used when the plugin does not return SUCCESS).
try( funccall("CheckForNvdimms"), calloutSelfMed );
# Presumably a threshold of 1, going by the action name -- confirm.
threshold1;
};

###############################################################################
# Analyze groups
###############################################################################
Expand Down
4 changes: 2 additions & 2 deletions src/usr/diag/prdf/common/plat/nimbus/nimbus_mcs.rule
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#
# OpenPOWER HostBoot Project
#
# Contributors Listed Below - COPYRIGHT 2016,2018
# Contributors Listed Below - COPYRIGHT 2016,2019
# [+] International Business Machines Corp.
#
#
Expand Down Expand Up @@ -148,7 +148,7 @@ group gMCFIR
/** MCFIR[0]
* mc internal recoverable eror
*/
(rMCFIR, bit(0)) ? self_th_1;
(rMCFIR, bit(0)) ? nvdimm_self_th_1;

/** MCFIR[1]
* mc internal non recovervable error
Expand Down
13 changes: 12 additions & 1 deletion src/usr/diag/prdf/common/plat/nimbus/nimbus_mcs_actions.rule
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#
# OpenPOWER HostBoot Project
#
# Contributors Listed Below - COPYRIGHT 2018
# Contributors Listed Below - COPYRIGHT 2018,2019
# [+] International Business Machines Corp.
#
#
Expand All @@ -23,6 +23,17 @@
#
# IBM_PROLOG_END_TAG

################################################################################
# NVDIMM callouts #
################################################################################

# Simple callouts that will avoid gard for NVDIMMs
# Self callout that avoids gard when an NVDIMM is connected.
actionclass nvdimm_self_th_1
{
# CheckForNvdimms is documented to call out self with NO_GARD and return
# SUCCESS when an NVDIMM is connected; calloutSelfMed is the alternative
# handed to try() (presumably used when the plugin does not return SUCCESS).
try( funccall("CheckForNvdimms"), calloutSelfMed );
# Presumably a threshold of 1, going by the action name -- confirm.
threshold1;
};

################################################################################
# Analyze groups
################################################################################
Expand Down
1 change: 0 additions & 1 deletion src/usr/diag/prdf/common/plat/p9/p9_common_actions.rule
Original file line number Diff line number Diff line change
Expand Up @@ -279,4 +279,3 @@ actionclass chip_to_chip
calloutSelfMed;
threshold1;
};

60 changes: 60 additions & 0 deletions src/usr/diag/prdf/common/plat/p9/prdfCommonPlugins.C
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,66 @@ PRDF_PLUGIN_DEFINE_NS(nimbus_proc, CommonPlugins, ClearServiceCallFlag_mnfgInfo
PRDF_PLUGIN_DEFINE_NS(cumulus_proc, CommonPlugins, ClearServiceCallFlag_mnfgInfo);
PRDF_PLUGIN_DEFINE_NS(axone_proc, CommonPlugins, ClearServiceCallFlag_mnfgInfo);

/**
 * @brief   Plugin that asks the service data collector to clear gard for
 *          the NVDIMM entries in its MRU list.
 * @param   i_chip The chip (not referenced by this plugin).
 * @param   io_sc  The step code data struct.
 * @returns SUCCESS, unconditionally.
 * @note    When __HOSTBOOT_MODULE is not defined the MRU list is left
 *          untouched and the plugin only returns SUCCESS.
 */
int32_t ClearNvdimmGardState( ExtensibleChip * i_chip,
                              STEP_CODE_DATA_STRUCT & io_sc )
{
    #ifdef __HOSTBOOT_MODULE

    // Delegate to the SDC, which owns the MRU list.
    auto & sdc = *io_sc.service_data;
    sdc.clearNvdimmMruListGard();

    #endif

    return SUCCESS;
}
PRDF_PLUGIN_DEFINE_NS(nimbus_mcs, CommonPlugins, ClearNvdimmGardState);
PRDF_PLUGIN_DEFINE_NS(nimbus_mca, CommonPlugins, ClearNvdimmGardState);
PRDF_PLUGIN_DEFINE_NS(nimbus_mcbist, CommonPlugins, ClearNvdimmGardState);

/**
 * @brief   Looks for an NVDIMM among the DIMMs connected to this chip. On
 *          the first one found, the chip itself is called out at medium
 *          priority with NO_GARD and the search stops.
 * @param   i_chip The chip that is the parent of the DIMMs.
 * @param   io_sc  The step code data struct.
 * @returns SUCCESS if an NVDIMM was found (callout already made),
 *          PRD_SCAN_COMM_REGISTER_ZERO otherwise.
 * @note    When __HOSTBOOT_MODULE is not defined no search is done and
 *          PRD_SCAN_COMM_REGISTER_ZERO is always returned.
 */
int32_t CheckForNvdimms( ExtensibleChip * i_chip,
                         STEP_CODE_DATA_STRUCT & io_sc )
{
    #ifdef __HOSTBOOT_MODULE

    auto chipTrgt = i_chip->getTrgt();
    const TargetHandleList connectedDimms = getConnected( chipTrgt, TYPE_DIMM );

    for ( const auto & candidate : connectedDimms )
    {
        if ( !isNVDIMM(candidate) ) continue;

        // One NVDIMM is enough: callout self, no gard, and report SUCCESS
        // so that no other actions are needed.
        io_sc.service_data->SetCallout( chipTrgt, MRU_MED, NO_GARD );
        return SUCCESS;
    }

    #endif

    // No NVDIMM found (or not a Hostboot build).
    return PRD_SCAN_COMM_REGISTER_ZERO;
}
PRDF_PLUGIN_DEFINE_NS(nimbus_mcs, CommonPlugins, CheckForNvdimms);
PRDF_PLUGIN_DEFINE_NS(nimbus_mca, CommonPlugins, CheckForNvdimms);
PRDF_PLUGIN_DEFINE_NS(nimbus_mcbist, CommonPlugins, CheckForNvdimms);

} // namespace CommonPlugins ends

}// namespace PRDF ends
Expand Down
14 changes: 14 additions & 0 deletions src/usr/diag/prdf/plat/mem/prdfMemDynDealloc.C
Original file line number Diff line number Diff line change
Expand Up @@ -1351,6 +1351,20 @@ int32_t dimmList( TargetHandleList & i_dimmList )
sendPredDeallocRequest( ssAddr, seAddr );
PRDF_TRAC( PRDF_FUNC "Predictive dealloc for start addr: 0x%016llx "
"end addr: 0x%016llx", ssAddr, seAddr );

// If the DIMM is an NVDIMM, send a message to PHYP that a save/restore
// may work.
if ( isNVDIMM(*it) )
{
uint32_t l_rc = PlatServices::nvdimmNotifyPhypProtChange( *it,
NVDIMM::NVDIMM_RISKY_HW_ERROR );
if ( SUCCESS != l_rc )
{
PRDF_TRAC( PRDF_FUNC "nvdimmNotifyPhypProtChange(0x%08x) "
"failed.", getHuid(*it) );
continue;
}
}
}

return o_rc;
Expand Down
4 changes: 2 additions & 2 deletions src/usr/diag/prdf/plat/mem/prdfP9Mca.C
Original file line number Diff line number Diff line change
Expand Up @@ -782,9 +782,9 @@ int32_t AnalyzeNvdimmHealthStatRegs( ExtensibleChip * i_chip,
// and make the log predictive.
io_sc.service_data->SetThresholdMaskId(0);

// Send persistency lost message to PHYP
// Send message to PHYP that save/restore may work
l_rc = PlatServices::nvdimmNotifyPhypProtChange( dimm,
NVDIMM::UNPROTECTED_BECAUSE_ERROR );
NVDIMM::NVDIMM_RISKY_HW_ERROR );
if ( SUCCESS != l_rc ) continue;

// Analyze Health Status0 Reg, Health Status1 Reg,
Expand Down

0 comments on commit 8db1ba5

Please sign in to comment.