Skip to content

Commit

Permalink
PRD: Axone UE/CE threshold for background scrub
Browse files Browse the repository at this point in the history
Change-Id: I2872ee94a385d84b10a6e0aaf7f3c0a95c496aa0
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/84551
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Reviewed-by: Benjamen G Tyner <ben.tyner@ibm.com>
Reviewed-by: Brian J Stegmiller <bjs@us.ibm.com>
Reviewed-by: Zane C Shelley <zshelle@us.ibm.com>
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/84781
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
  • Loading branch information
cnpalmer authored and zane131 committed Oct 4, 2019
1 parent 9d750b3 commit 2dbc309
Show file tree
Hide file tree
Showing 4 changed files with 100 additions and 20 deletions.
7 changes: 7 additions & 0 deletions src/usr/diag/prdf/common/plat/mem/prdfOcmbDataBundle.H
Expand Up @@ -40,6 +40,7 @@

#ifdef __HOSTBOOT_MODULE

#include <prdfMemScrubUtils.H>
#include <prdfMemTdFalseAlarm.H>
#include <prdfMemThresholds.H>
#include <prdfMemTdCtlr.H>
Expand Down Expand Up @@ -206,6 +207,12 @@ class OcmbDataBundle : public DataBundle
* scrubbing is resumed. */
bool iv_maskMainlineNceTce = false;

// These are used to limit the number of times a scrub command will stop
// on a UE or CE on a rank. This is to prevent potential flooding of
// maintenance UEs or CEs. The threshold will be 16 per rank for each.
ScrubResumeCounter iv_ueScrubStopCounter;
ScrubResumeCounter iv_ceScrubStopCounter;

#else // IPL only

/** MNFG IPL CE statistics. */
Expand Down
60 changes: 46 additions & 14 deletions src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C
Expand Up @@ -1693,21 +1693,53 @@ uint32_t MemTdCtlr<TYPE_OCMB_CHIP>::canResumeBgScrub( bool & o_canResume )
// can use the stop conditions, which should be unique for background scrub,
// to determine if it has been configured.

SCAN_COMM_REGISTER_CLASS * reg = iv_chip->getRegister( "MBSTR" );
o_rc = reg->Read();
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "Read() failed on MBSTR: iv_chip=0x%08x",
iv_chip->getHuid() );
}
else if ( 0xf != reg->GetBitFieldJustified(0,4) && // NCE int TH
0xf != reg->GetBitFieldJustified(4,4) && // NCE soft TH
0xf != reg->GetBitFieldJustified(8,4) && // NCE hard TH
reg->IsBitSet(34) && // pause on MPE
reg->IsBitSet(35) ) // pause on UE
do
{
o_canResume = true;
}
SCAN_COMM_REGISTER_CLASS * reg = iv_chip->getRegister( "MBSTR" );
o_rc = reg->Read();
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "Read() failed on MBSTR: iv_chip=0x%08x",
iv_chip->getHuid() );
break;
}
// Note: The stop conditions for background scrubbing can now vary
// depending on whether we have hit the threshold for the number of
// UEs or CEs that the scrub command has stopped on for a rank.

// If we haven't hit CE or UE threshold, check the CE stop conditions
if ( !getOcmbDataBundle(iv_chip)->iv_ceScrubStopCounter.atTh() &&
!getOcmbDataBundle(iv_chip)->iv_ueScrubStopCounter.atTh() )
{
// If the stop conditions aren't set, just break out.
if ( !(0xf != reg->GetBitFieldJustified(0,4) && // NCE int TH
0xf != reg->GetBitFieldJustified(4,4) && // NCE soft TH
0xf != reg->GetBitFieldJustified(8,4)) ) // NCE hard TH
{
break;
}

}

// If we haven't hit UE threshold yet, check the UE stop condition
if ( !getOcmbDataBundle(iv_chip)->iv_ueScrubStopCounter.atTh() )
{
// If the stop condition isn't set, just break out
if ( !reg->IsBitSet(35) ) // pause on UE
{
break;
}
}

// Need to check the pause-on-MPE stop condition regardless of whether
// we hit the UE or CE threshold.
if ( reg->IsBitSet(34) ) // pause on MPE
{
// If we reach here, all the stop conditions are set for background
// scrub, so we can resume.
o_canResume = true;
}
}while(0);

return o_rc;

Expand Down
7 changes: 7 additions & 0 deletions src/usr/diag/prdf/plat/prdfPlatServices.C
Expand Up @@ -41,6 +41,7 @@

#include <prdfCenMbaDataBundle.H>
#include <prdfP9McbistDataBundle.H>
#include <prdfOcmbDataBundle.H>
#include <prdfMemScrubUtils.H>

#include <iipServiceDataCollector.h>
Expand Down Expand Up @@ -1442,6 +1443,12 @@ uint32_t startBgScrub<TYPE_OCMB_CHIP>( ExtensibleChip * i_ocmb,
// Get the OCMB fapi target
fapi2::Target<fapi2::TARGET_TYPE_OCMB_CHIP> fapiTrgt (i_ocmb->getTrgt());
#ifdef __HOSTBOOT_RUNTIME
// Starting a new command. Clear the UE and CE scrub stop counters
getOcmbDataBundle( mcbChip )->iv_ueScrubStopCounter.reset();
getOcmbDataBundle( mcbChip )->iv_ceScrubStopCounter.reset();
#endif
// Get the stop conditions.
// NOTE: If HBRT_PRD is not configured, we want to use the defaults so that
// background scrubbing never stops.
Expand Down
46 changes: 40 additions & 6 deletions src/usr/diag/prdf/plat/prdfPlatServices_rt.C
Expand Up @@ -38,6 +38,7 @@
// Platform includes
#include <prdfCenMbaDataBundle.H>
#include <prdfP9McbistDataBundle.H>
#include <prdfOcmbDataBundle.H>
#include <prdfMemScrubUtils.H>
#include <prdfPlatServices.H>

Expand Down Expand Up @@ -467,10 +468,8 @@ uint32_t resumeBgScrub<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip )
uint32_t o_rc = SUCCESS;

PRDF_TRAC( PRDF_FUNC "Function not supported yet" );

/* TODO RTC 207273 - no hwp support yet
// Get the OCMB_CHIP fapi target
/* TODO RTC 207273 - no HWP support yet
// Get the OCMB fapi target
fapi2::Target<fapi2::TARGET_TYPE_OCMB_CHIP> fapiTrgt ( i_chip->getTrgt() );
do
Expand All @@ -484,9 +483,45 @@ uint32_t resumeBgScrub<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip )
break;
}
// Check UE and CE stop counters to determine stop conditions
mss::mcbist::stop_conditions<> stopCond;
if ( getOcmbDataBundle(i_chip)->iv_ueScrubStopCounter.atTh() )
{
// If we've reached the limit of UEs we're allowed to stop on
// per rank, set only the pause-on-MPE stop condition.
stopCond.set_pause_on_mpe(mss::ON);
}
else if ( getOcmbDataBundle(i_chip)->iv_ceScrubStopCounter.atTh() )
{
// If we've reached the limit of CEs we're allowed to stop on
// per rank, set all the normal stop conditions except stop on CE
stopCond.set_pause_on_aue(mss::ON);
#ifdef CONFIG_HBRT_PRD
stopCond.set_pause_on_mpe(mss::ON)
.set_pause_on_ue(mss::ON);
// In MNFG mode, stop on RCE_ETE to get an accurate callout for IUEs
if ( mfgMode() ) stopCond.set_thresh_rce(1);
#endif
}
else
{
// If we haven't reached threshold on the number of UEs or CEs we
// have stopped on, do not change the stop conditions.
stopCond = mss::mcbist::stop_conditions<>(
mss::mcbist::stop_conditions<>::DONT_CHANGE );
}
// Resume the command on the next address.
// Note: we have to limit the number of times a command has been stopped
// because of a UE/CE. Therefore, we must always resume the command to
// the end of the current slave rank so we can reset the UE/CE counts.
errlHndl_t errl;
FAPI_INVOKE_HWP( errl, mss::memdiags::continue_cmd, fapiTrgt );
FAPI_INVOKE_HWP( errl, mss::memdiags::continue_cmd, fapiTrgt,
mss::mcbist::STOP_AFTER_SLAVE_RANK, stopCond );
if ( nullptr != errl )
{
Expand All @@ -497,7 +532,6 @@ uint32_t resumeBgScrub<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip )
}
} while (0);
*/

return o_rc;
Expand Down

0 comments on commit 2dbc309

Please sign in to comment.