From 70a4b6bf1a02fbe7d71a67f6ebae6e75f0e20c95 Mon Sep 17 00:00:00 2001 From: Caleb Palmer Date: Mon, 23 Sep 2019 08:18:27 -0500 Subject: [PATCH] PRD: Add threshold for stopping on UEs/CEs during BgScrub Change-Id: Iddeec04300631fc57b5c2f4a2eb57302e9f98fe2 CQ: SW476467 Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/84071 Reviewed-by: Benjamen G Tyner Reviewed-by: Paul Greenwood Tested-by: Jenkins Server Reviewed-by: Zane C Shelley Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/84073 Tested-by: Jenkins OP Build CI Tested-by: Jenkins OP HW Tested-by: FSP CI Jenkins --- src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C | 60 ++++++++++++++----- .../prdf/plat/mem/prdfP9McbistDataBundle.H | 13 +++- src/usr/diag/prdf/plat/prdfPlatServices.C | 7 +++ src/usr/diag/prdf/plat/prdfPlatServices_rt.C | 39 +++++++++++- 4 files changed, 103 insertions(+), 16 deletions(-) diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C index 779d13a73db..817a345f350 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C +++ b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C @@ -1623,21 +1623,53 @@ uint32_t MemTdCtlr::canResumeBgScrub( bool & o_canResume ) // can use the stop conditions, which should be unique for background scrub, // to determine if it has been configured. 
- SCAN_COMM_REGISTER_CLASS * reg = iv_chip->getRegister( "MBSTR" ); - o_rc = reg->Read(); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "Read() failed on MBSTR: iv_chip=0x%08x", - iv_chip->getHuid() ); - } - else if ( 0xf != reg->GetBitFieldJustified(0,4) && // NCE int TH - 0xf != reg->GetBitFieldJustified(4,4) && // NCE soft TH - 0xf != reg->GetBitFieldJustified(8,4) && // NCE hard TH - reg->IsBitSet(34) && // pause on MPE - reg->IsBitSet(35) ) // pause on UE + do { - o_canResume = true; - } + SCAN_COMM_REGISTER_CLASS * reg = iv_chip->getRegister( "MBSTR" ); + o_rc = reg->Read(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Read() failed on MBSTR: iv_chip=0x%08x", + iv_chip->getHuid() ); + break; + } + // Note: The stop conditions for background scrubbing can now be + // variable depending on whether we have hit threshold for the number + // of UEs or CEs that we have stopped on for a rank. + + // If we haven't hit CE or UE threshold, check the CE stop conditions + if ( !getMcbistDataBundle(iv_chip)->iv_ceScrubStopCounter.atTh() && + !getMcbistDataBundle(iv_chip)->iv_ueScrubStopCounter.atTh() ) + { + // If the stop conditions aren't set, just break out. + if ( !(0xf != reg->GetBitFieldJustified(0,4) && // NCE int TH + 0xf != reg->GetBitFieldJustified(4,4) && // NCE soft TH + 0xf != reg->GetBitFieldJustified(8,4)) ) // NCE hard TH + { + break; + } + + } + + // If we haven't hit UE threshold yet, check the UE stop condition + if ( !getMcbistDataBundle(iv_chip)->iv_ueScrubStopCounter.atTh() ) + { + // If the stop condition isn't set, just break out + if ( !reg->IsBitSet(35) ) // pause on UE + { + break; + } + } + + // Need to check the stop on mpe stop condition regardless of whether + // we hit the UE or CE threshold. + if ( reg->IsBitSet(34) ) // pause on MPE + { + // If we reach here, all the stop conditions are set for background + // scrub, so we can resume. 
+ o_canResume = true; + } + }while(0); return o_rc; diff --git a/src/usr/diag/prdf/plat/mem/prdfP9McbistDataBundle.H b/src/usr/diag/prdf/plat/mem/prdfP9McbistDataBundle.H index 4a284253a3e..3883eb9369a 100644 --- a/src/usr/diag/prdf/plat/mem/prdfP9McbistDataBundle.H +++ b/src/usr/diag/prdf/plat/mem/prdfP9McbistDataBundle.H @@ -5,7 +5,7 @@ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2016 */ +/* Contributors Listed Below - COPYRIGHT 2016,2019 */ /* [+] International Business Machines Corp. */ /* */ /* */ @@ -81,6 +81,17 @@ class McbistDataBundle : public DataBundle /** The Targeted Diagnostics controller. */ MemTdCtlr * iv_tdCtlr = nullptr; + + public: // instance variables + #ifdef __HOSTBOOT_RUNTIME + + // These are used to limit the number of times a scrub command will stop + // on a UE or CE on a rank. This is to prevent potential flooding of + // maintenance UEs or CEs. The threshold will be 16 per rank for each. + ScrubResumeCounter iv_ueScrubStopCounter; + ScrubResumeCounter iv_ceScrubStopCounter; + + #endif }; /** diff --git a/src/usr/diag/prdf/plat/prdfPlatServices.C b/src/usr/diag/prdf/plat/prdfPlatServices.C index 48d26f8b8ee..e4b122ab56e 100644 --- a/src/usr/diag/prdf/plat/prdfPlatServices.C +++ b/src/usr/diag/prdf/plat/prdfPlatServices.C @@ -40,6 +40,7 @@ #include #include +#include #include #include @@ -791,6 +792,12 @@ uint32_t startBgScrub( ExtensibleChip * i_mcaChip, ExtensibleChip * mcbChip = getConnectedParent( i_mcaChip, TYPE_MCBIST ); fapi2::Target fapiTrgt ( mcbChip->getTrgt() ); + #ifdef __HOSTBOOT_RUNTIME + // Starting a new command. Clear the UE and CE scrub stop counters + getMcbistDataBundle( mcbChip )->iv_ueScrubStopCounter.reset(); + getMcbistDataBundle( mcbChip )->iv_ceScrubStopCounter.reset(); + #endif + // Get the stop conditions. // NOTE: If HBRT_PRD is not configured, we want to use the defaults so that // background scrubbing never stops. 
diff --git a/src/usr/diag/prdf/plat/prdfPlatServices_rt.C b/src/usr/diag/prdf/plat/prdfPlatServices_rt.C index e3bdecb5966..cd0b045dc1f 100644 --- a/src/usr/diag/prdf/plat/prdfPlatServices_rt.C +++ b/src/usr/diag/prdf/plat/prdfPlatServices_rt.C @@ -37,6 +37,7 @@ // Platform includes #include +#include #include #include @@ -173,9 +174,45 @@ uint32_t resumeBgScrub( ExtensibleChip * i_chip ) break; } + // Check UE and CE stop counters to determine stop conditions + mss::mcbist::stop_conditions<> stopCond; + if ( getMcbistDataBundle(i_chip)->iv_ueScrubStopCounter.atTh() ) + { + // If we've reached the limit of UEs we're allowed to stop on + // per rank, only set the stop on mpe stop condition. + stopCond.set_pause_on_mpe(mss::ON); + } + else if ( getMcbistDataBundle(i_chip)->iv_ceScrubStopCounter.atTh() ) + { + // If we've reached the limit of CEs we're allowed to stop on + // per rank, set all the normal stop conditions except stop on CE + stopCond.set_pause_on_aue(mss::ON); + + #ifdef CONFIG_HBRT_PRD + + stopCond.set_pause_on_mpe(mss::ON) + .set_pause_on_ue(mss::ON); + + // In MNFG mode, stop on RCE_ETE to get an accurate callout for IUEs + if ( mfgMode() ) stopCond.set_thresh_rce(1); + + #endif + } + else + { + // If we haven't reached threshold on the number of UEs or CEs we + // have stopped on, do not change the stop conditions. + stopCond = mss::mcbist::stop_conditions<>( + mss::mcbist::stop_conditions<>::DONT_CHANGE ); + } + // Resume the command on the next address. + // Note: we have to limit the number of times a command has been stopped + // because of a UE/CE. Therefore, we must always resume the command to + // the end of the current slave rank so we can reset the UE/CE counts. errlHndl_t errl; - FAPI_INVOKE_HWP( errl, mss::memdiags::continue_cmd, fapiTrgt ); + FAPI_INVOKE_HWP( errl, mss::memdiags::continue_cmd, fapiTrgt, + mss::mcbist::STOP_AFTER_SLAVE_RANK, stopCond ); if ( nullptr != errl ) {