Skip to content

Commit

Permalink
PRD: Add threshold for stopping on UEs/CEs during BgScrub
Browse files Browse the repository at this point in the history
Change-Id: Iddeec04300631fc57b5c2f4a2eb57302e9f98fe2
CQ: SW476467
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/84071
Reviewed-by: Benjamen G Tyner <ben.tyner@ibm.com>
Reviewed-by: Paul Greenwood <paul.greenwood@ibm.com>
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Reviewed-by: Zane C Shelley <zshelle@us.ibm.com>
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/84073
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
  • Loading branch information
cnpalmer authored and zane131 committed Sep 30, 2019
1 parent 0f3e041 commit 70a4b6b
Show file tree
Hide file tree
Showing 4 changed files with 103 additions and 16 deletions.
60 changes: 46 additions & 14 deletions src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C
Expand Up @@ -1623,21 +1623,53 @@ uint32_t MemTdCtlr<TYPE_MCBIST>::canResumeBgScrub( bool & o_canResume )
// can use the stop conditions, which should be unique for background scrub,
// to determine if it has been configured.

SCAN_COMM_REGISTER_CLASS * reg = iv_chip->getRegister( "MBSTR" );
o_rc = reg->Read();
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "Read() failed on MBSTR: iv_chip=0x%08x",
iv_chip->getHuid() );
}
else if ( 0xf != reg->GetBitFieldJustified(0,4) && // NCE int TH
0xf != reg->GetBitFieldJustified(4,4) && // NCE soft TH
0xf != reg->GetBitFieldJustified(8,4) && // NCE hard TH
reg->IsBitSet(34) && // pause on MPE
reg->IsBitSet(35) ) // pause on UE
do
{
o_canResume = true;
}
SCAN_COMM_REGISTER_CLASS * reg = iv_chip->getRegister( "MBSTR" );
o_rc = reg->Read();
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "Read() failed on MBSTR: iv_chip=0x%08x",
iv_chip->getHuid() );
break;
}
// Note: The stop conditions for background scrubbing can now vary
// depending on whether we have hit the threshold for the number of UEs
// or CEs that we have stopped on for a rank.

// If we haven't hit CE or UE threshold, check the CE stop conditions
if ( !getMcbistDataBundle(iv_chip)->iv_ceScrubStopCounter.atTh() &&
!getMcbistDataBundle(iv_chip)->iv_ueScrubStopCounter.atTh() )
{
// If the stop conditions aren't set, just break out.
if ( !(0xf != reg->GetBitFieldJustified(0,4) && // NCE int TH
0xf != reg->GetBitFieldJustified(4,4) && // NCE soft TH
0xf != reg->GetBitFieldJustified(8,4)) ) // NCE hard TH
{
break;
}

}

// If we haven't hit UE threshold yet, check the UE stop condition
if ( !getMcbistDataBundle(iv_chip)->iv_ueScrubStopCounter.atTh() )
{
// If the stop condition isn't set, just break out
if ( !reg->IsBitSet(35) ) // pause on UE
{
break;
}
}

// The stop-on-MPE condition must be checked regardless of whether we
// hit the UE or CE threshold.
if ( reg->IsBitSet(34) ) // pause on MPE
{
// If we reach here, all the stop conditions that still apply to
// background scrub are set, so we can resume.
o_canResume = true;
}
}while(0);

return o_rc;

Expand Down
13 changes: 12 additions & 1 deletion src/usr/diag/prdf/plat/mem/prdfP9McbistDataBundle.H
Expand Up @@ -5,7 +5,7 @@
/* */
/* OpenPOWER HostBoot Project */
/* */
/* Contributors Listed Below - COPYRIGHT 2016 */
/* Contributors Listed Below - COPYRIGHT 2016,2019 */
/* [+] International Business Machines Corp. */
/* */
/* */
Expand Down Expand Up @@ -81,6 +81,17 @@ class McbistDataBundle : public DataBundle

/** The Targeted Diagnostics controller. */
MemTdCtlr<TARGETING::TYPE_MCBIST> * iv_tdCtlr = nullptr;

public: // instance variables
#ifdef __HOSTBOOT_RUNTIME

// These are used to limit the number of times a scrub command will stop
// on a UE or CE on a rank. This is to prevent potential flooding of
// maintenance UEs or CEs. The threshold will be 16 per rank for each.
ScrubResumeCounter iv_ueScrubStopCounter;
ScrubResumeCounter iv_ceScrubStopCounter;

#endif
};

/**
Expand Down
7 changes: 7 additions & 0 deletions src/usr/diag/prdf/plat/prdfPlatServices.C
Expand Up @@ -40,6 +40,7 @@
#include <prdfRegisterCache.H>

#include <prdfCenMbaDataBundle.H>
#include <prdfP9McbistDataBundle.H>
#include <prdfMemScrubUtils.H>

#include <iipServiceDataCollector.h>
Expand Down Expand Up @@ -791,6 +792,12 @@ uint32_t startBgScrub<TYPE_MCA>( ExtensibleChip * i_mcaChip,
ExtensibleChip * mcbChip = getConnectedParent( i_mcaChip, TYPE_MCBIST );
fapi2::Target<fapi2::TARGET_TYPE_MCBIST> fapiTrgt ( mcbChip->getTrgt() );

#ifdef __HOSTBOOT_RUNTIME
// Starting a new command. Clear the UE and CE scrub stop counters
getMcbistDataBundle( mcbChip )->iv_ueScrubStopCounter.reset();
getMcbistDataBundle( mcbChip )->iv_ceScrubStopCounter.reset();
#endif

// Get the stop conditions.
// NOTE: If HBRT_PRD is not configured, we want to use the defaults so that
// background scrubbing never stops.
Expand Down
39 changes: 38 additions & 1 deletion src/usr/diag/prdf/plat/prdfPlatServices_rt.C
Expand Up @@ -37,6 +37,7 @@

// Platform includes
#include <prdfCenMbaDataBundle.H>
#include <prdfP9McbistDataBundle.H>
#include <prdfMemScrubUtils.H>
#include <prdfPlatServices.H>

Expand Down Expand Up @@ -173,9 +174,45 @@ uint32_t resumeBgScrub<TYPE_MCBIST>( ExtensibleChip * i_chip )
break;
}

// Check UE and CE stop counters to determine stop conditions
mss::mcbist::stop_conditions<> stopCond;
if ( getMcbistDataBundle(i_chip)->iv_ueScrubStopCounter.atTh() )
{
// If we've reached the limit of UEs we're allowed to stop on
// per rank, set only the stop-on-MPE stop condition.
stopCond.set_pause_on_mpe(mss::ON);
}
else if ( getMcbistDataBundle(i_chip)->iv_ceScrubStopCounter.atTh() )
{
// If we've reached the limit of CEs we're allowed to stop on
// per rank, set all the normal stop conditions except stop on CE
stopCond.set_pause_on_aue(mss::ON);

#ifdef CONFIG_HBRT_PRD

stopCond.set_pause_on_mpe(mss::ON)
.set_pause_on_ue(mss::ON);

// In MNFG mode, stop on RCE_ETE to get an accurate callout for IUEs
if ( mfgMode() ) stopCond.set_thresh_rce(1);

#endif
}
else
{
// If we haven't reached threshold on the number of UEs or CEs we
// have stopped on, do not change the stop conditions.
stopCond = mss::mcbist::stop_conditions<>(
mss::mcbist::stop_conditions<>::DONT_CHANGE );
}

// Resume the command on the next address.
// Note: we have to limit the number of times a command has been stopped
// because of a UE/CE. Therefore, we must always resume the command to
// the end of the current slave rank so we can reset the UE/CE counts.
errlHndl_t errl;
FAPI_INVOKE_HWP( errl, mss::memdiags::continue_cmd, fapiTrgt );
FAPI_INVOKE_HWP( errl, mss::memdiags::continue_cmd, fapiTrgt,
mss::mcbist::STOP_AFTER_SLAVE_RANK, stopCond );

if ( nullptr != errl )
{
Expand Down

0 comments on commit 70a4b6b

Please sign in to comment.