Skip to content

Commit

Permalink
PRD: Update CE/UE flood threshold to reset on new ranks
Browse files Browse the repository at this point in the history
Change-Id: I89dce691642ebf4d753812bfae111d14c52753e3
CQ: SW480922
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/87032
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Reviewed-by: Zane C Shelley <zshelle@us.ibm.com>
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/88203
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
  • Loading branch information
cnpalmer authored and zane131 committed Dec 6, 2019
1 parent 946a75d commit 5743467
Show file tree
Hide file tree
Showing 10 changed files with 137 additions and 47 deletions.
Expand Up @@ -5,7 +5,9 @@
/* */
/* OpenPOWER HostBoot Project */
/* */
/* COPYRIGHT International Business Machines Corp. 2003,2014 */
/* Contributors Listed Below - COPYRIGHT 2003,2019 */
/* [+] International Business Machines Corp. */
/* */
/* */
/* Licensed under the Apache License, Version 2.0 (the "License"); */
/* you may not use this file except in compliance with the License. */
Expand Down Expand Up @@ -100,10 +102,11 @@ class ThresholdResolution : public MaskResolution

enum TimeBase
{
ONE_SEC = 1,
ONE_MIN = ONE_SEC * 60,
ONE_HOUR = ONE_MIN * 60,
ONE_DAY = ONE_HOUR * 24,
ONE_SEC = 1,
ONE_MIN = ONE_SEC * 60,
ONE_HOUR = ONE_MIN * 60,
TEN_HOURS = ONE_HOUR * 10,
ONE_DAY = ONE_HOUR * 24,

NONE = 0xffffffff,
};
Expand Down
28 changes: 28 additions & 0 deletions src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C
Expand Up @@ -127,6 +127,20 @@ uint32_t handleMemUe<TYPE_MCA>( ExtensibleChip * i_chip, const MemAddr & i_addr,
i_chip->getHuid(), i_type );
break;
}

#ifdef __HOSTBOOT_RUNTIME
// Increment the UE counter and store the rank we're on, resetting
// the UE and CE counts if we have stopped on a new rank.
ExtensibleChip * mcb = getConnectedParent( i_chip, TYPE_MCBIST );
McbistDataBundle * mcbdb = getMcbistDataBundle(mcb);
if ( mcbdb->iv_ceUeRank != i_addr.getRank() )
{
mcbdb->iv_ceStopCounter.reset();
mcbdb->iv_ueStopCounter.reset();
}
mcbdb->iv_ueStopCounter.inc( io_sc );
mcbdb->iv_ceUeRank = i_addr.getRank();
#endif
}

} while (0);
Expand Down Expand Up @@ -180,6 +194,20 @@ uint32_t handleMemUe<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip,
i_chip->getHuid(), i_type );
break;
}

#ifdef __HOSTBOOT_RUNTIME
// Increment the UE counter and store the rank we're on, resetting
// the UE and CE counts if we have stopped on a new rank.
OcmbDataBundle * ocmbdb = getOcmbDataBundle(i_chip);
if ( ocmbdb->iv_ceUeRank != i_addr.getRank() )
{
ocmbdb->iv_ceStopCounter.reset();
ocmbdb->iv_ueStopCounter.reset();
}
ocmbdb->iv_ueStopCounter.inc( io_sc );
ocmbdb->iv_ceUeRank = i_addr.getRank();
#endif

}

} while (0);
Expand Down
11 changes: 9 additions & 2 deletions src/usr/diag/prdf/common/plat/mem/prdfOcmbDataBundle.H
Expand Up @@ -210,8 +210,15 @@ class OcmbDataBundle : public DataBundle
// These are used to limit the number of times a scrub command will stop
// on a UE or CE on a rank. This is to prevent potential flooding of
// maintenance UEs or CEs. The threshold for each is 16 per rank, using a
// time base of ThresholdResolution::TEN_HOURS.
ScrubResumeCounter iv_ueScrubStopCounter;
ScrubResumeCounter iv_ceScrubStopCounter;
TimeBasedThreshold iv_ueStopCounter =
TimeBasedThreshold( 16, ThresholdResolution::TEN_HOURS );
TimeBasedThreshold iv_ceStopCounter =
TimeBasedThreshold( 16, ThresholdResolution::TEN_HOURS );;

// If we stop on a UE or a CE, we will need to store the rank that the
// error is on so that we can clear our respective thresholds if the
// next error we stop on is on a different rank.
MemRank iv_ceUeRank;

#else // IPL only

Expand Down
2 changes: 1 addition & 1 deletion src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.C
Expand Up @@ -384,7 +384,7 @@ uint32_t MemTdCtlr<T>::analyzeCmdComplete( bool & o_errorsFound,
// of in defaultStep() because a TD procedure could have been run
// before defaultStep() and it is possible that canResumeBgScrub()
// could give us a false positive in that case.
o_rc = canResumeBgScrub( iv_resumeBgScrub );
o_rc = canResumeBgScrub( iv_resumeBgScrub, io_sc );
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "canResumeBgScrub(0x%08x) failed",
Expand Down
4 changes: 3 additions & 1 deletion src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.H
Expand Up @@ -294,9 +294,11 @@ class MemTdCtlr
/**
* @param o_canResume True, if background scrubbing can be resumed. False,
* if a new background scrub command must be started.
* @param io_sc The step code data struct.
* @return Non-SUCCESS if an internal function fails, SUCCESS otherwise.
*/
uint32_t canResumeBgScrub( bool & o_canResume );
uint32_t canResumeBgScrub( bool & o_canResume,
STEP_CODE_DATA_STRUCT & io_sc );

#endif

Expand Down
69 changes: 55 additions & 14 deletions src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C
Expand Up @@ -313,7 +313,7 @@ uint32_t MemTdCtlr<T>::defaultStep( STEP_CODE_DATA_STRUCT & io_sc )
PRDF_TRAC( PRDF_FUNC "Calling resumeBgScrub<T>(0x%08x)",
iv_chip->getHuid() );

o_rc = resumeBgScrub<T>( iv_chip );
o_rc = resumeBgScrub<T>( iv_chip, io_sc );
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "resumeBgScrub<T>(0x%08x) failed",
Expand Down Expand Up @@ -388,10 +388,48 @@ uint32_t __handleNceEte( ExtensibleChip * i_chip,
uint32_t count = symData.size();
switch ( T )
{
case TYPE_MCA: PRDF_ASSERT( 1 <= count && count <= 2 ); break;
case TYPE_MBA: PRDF_ASSERT( 1 == count ); break;
case TYPE_OCMB_CHIP: PRDF_ASSERT( 1 <= count && count <= 2 ); break;
default: PRDF_ASSERT( false );
case TYPE_MCA:
{
PRDF_ASSERT( 1 <= count && count <= 2 );
// Increment the CE counter and store the rank we're on,
// reset the UE and CE counts if we have stopped on a new rank.
ExtensibleChip * mcb = getConnectedParent(i_chip, TYPE_MCBIST);
McbistDataBundle * mcbdb = getMcbistDataBundle(mcb);
if ( mcbdb->iv_ceUeRank != i_addr.getRank() )
{
mcbdb->iv_ceStopCounter.reset();
mcbdb->iv_ueStopCounter.reset();
}
mcbdb->iv_ceStopCounter.inc( io_sc );
mcbdb->iv_ceUeRank = i_addr.getRank();

break;
}
case TYPE_MBA:
{
PRDF_ASSERT( 1 == count );
break;
}
case TYPE_OCMB_CHIP:
{
PRDF_ASSERT( 1 <= count && count <= 2 );
// Increment the CE counter and store the rank we're on, resetting
// the UE and CE counts if we have stopped on a new rank.
OcmbDataBundle * ocmbdb = getOcmbDataBundle(i_chip);
if ( ocmbdb->iv_ceUeRank != i_addr.getRank() )
{
ocmbdb->iv_ceStopCounter.reset();
ocmbdb->iv_ueStopCounter.reset();
}
ocmbdb->iv_ceStopCounter.inc( io_sc );
ocmbdb->iv_ceUeRank = i_addr.getRank();

break;
}
default:
{
PRDF_ASSERT( false );
}
}

for ( auto & d : symData )
Expand Down Expand Up @@ -1607,7 +1645,8 @@ uint32_t MemTdCtlr<TYPE_MBA>::handleRrFo()
//------------------------------------------------------------------------------

template<>
uint32_t MemTdCtlr<TYPE_MCBIST>::canResumeBgScrub( bool & o_canResume )
uint32_t MemTdCtlr<TYPE_MCBIST>::canResumeBgScrub( bool & o_canResume,
STEP_CODE_DATA_STRUCT & io_sc )
{
#define PRDF_FUNC "[MemTdCtlr<TYPE_MCBIST>::canResumeBgScrub] "

Expand Down Expand Up @@ -1638,8 +1677,8 @@ uint32_t MemTdCtlr<TYPE_MCBIST>::canResumeBgScrub( bool & o_canResume )
// of UEs or CEs that we have stopped on on a rank.

// If we haven't hit CE or UE threshold, check the CE stop conditions
if ( !getMcbistDataBundle(iv_chip)->iv_ceScrubStopCounter.atTh() &&
!getMcbistDataBundle(iv_chip)->iv_ueScrubStopCounter.atTh() )
if ( !getMcbistDataBundle(iv_chip)->iv_ceStopCounter.thReached(io_sc) &&
!getMcbistDataBundle(iv_chip)->iv_ueStopCounter.thReached(io_sc) )
{
// If the stop conditions aren't set, just break out.
if ( !(0xf != reg->GetBitFieldJustified(0,4) && // NCE int TH
Expand All @@ -1652,7 +1691,7 @@ uint32_t MemTdCtlr<TYPE_MCBIST>::canResumeBgScrub( bool & o_canResume )
}

// If we haven't hit UE threshold yet, check the UE stop condition
if ( !getMcbistDataBundle(iv_chip)->iv_ueScrubStopCounter.atTh() )
if ( !getMcbistDataBundle(iv_chip)->iv_ueStopCounter.thReached(io_sc) )
{
// If the stop condition isn't set, just break out
if ( !reg->IsBitSet(35) ) // pause on UE
Expand All @@ -1677,7 +1716,8 @@ uint32_t MemTdCtlr<TYPE_MCBIST>::canResumeBgScrub( bool & o_canResume )
}

template<>
uint32_t MemTdCtlr<TYPE_OCMB_CHIP>::canResumeBgScrub( bool & o_canResume )
uint32_t MemTdCtlr<TYPE_OCMB_CHIP>::canResumeBgScrub( bool & o_canResume,
STEP_CODE_DATA_STRUCT & io_sc )
{
#define PRDF_FUNC "[MemTdCtlr<TYPE_OCMB_CHIP>::canResumeBgScrub] "

Expand Down Expand Up @@ -1708,8 +1748,8 @@ uint32_t MemTdCtlr<TYPE_OCMB_CHIP>::canResumeBgScrub( bool & o_canResume )
// of UEs or CEs that we have stopped on on a rank.

// If we haven't hit CE or UE threshold, check the CE stop conditions
if ( !getOcmbDataBundle(iv_chip)->iv_ceScrubStopCounter.atTh() &&
!getOcmbDataBundle(iv_chip)->iv_ueScrubStopCounter.atTh() )
if ( !getOcmbDataBundle(iv_chip)->iv_ceStopCounter.thReached(io_sc) &&
!getOcmbDataBundle(iv_chip)->iv_ueStopCounter.thReached(io_sc) )
{
// If the stop conditions aren't set, just break out.
if ( !(0xf != reg->GetBitFieldJustified(0,4) && // NCE int TH
Expand All @@ -1722,7 +1762,7 @@ uint32_t MemTdCtlr<TYPE_OCMB_CHIP>::canResumeBgScrub( bool & o_canResume )
}

// If we haven't hit UE threshold yet, check the UE stop condition
if ( !getOcmbDataBundle(iv_chip)->iv_ueScrubStopCounter.atTh() )
if ( !getOcmbDataBundle(iv_chip)->iv_ueStopCounter.thReached(io_sc) )
{
// If the stop condition isn't set, just break out
if ( !reg->IsBitSet(35) ) // pause on UE
Expand All @@ -1747,7 +1787,8 @@ uint32_t MemTdCtlr<TYPE_OCMB_CHIP>::canResumeBgScrub( bool & o_canResume )
}

template<>
uint32_t MemTdCtlr<TYPE_MBA>::canResumeBgScrub( bool & o_canResume )
uint32_t MemTdCtlr<TYPE_MBA>::canResumeBgScrub( bool & o_canResume,
STEP_CODE_DATA_STRUCT & io_sc )
{
#define PRDF_FUNC "[MemTdCtlr<TYPE_MBA>::canResumeBgScrub] "

Expand Down
12 changes: 10 additions & 2 deletions src/usr/diag/prdf/plat/mem/prdfP9McbistDataBundle.H
Expand Up @@ -36,6 +36,7 @@
// Platform includes
#include <prdfMemTdCtlr.H>
#include <prdfPlatServices.H>
#include <prdfThresholdUtils.H>

namespace PRDF
{
Expand Down Expand Up @@ -88,8 +89,15 @@ class McbistDataBundle : public DataBundle
// These are used to limit the number of times a scrub command will stop
// on a UE or CE on a rank. This is to prevent potential flooding of
// maintenance UEs or CEs. The threshold for each is 16 per rank, using a
// time base of ThresholdResolution::TEN_HOURS.
ScrubResumeCounter iv_ueScrubStopCounter;
ScrubResumeCounter iv_ceScrubStopCounter;
TimeBasedThreshold iv_ueStopCounter =
TimeBasedThreshold( 16, ThresholdResolution::TEN_HOURS );
TimeBasedThreshold iv_ceStopCounter =
TimeBasedThreshold( 16, ThresholdResolution::TEN_HOURS );

// If we stop on a UE or a CE, we will need to store the rank that the
// error is on so that we can clear our respective thresholds if the
// next error we stop on is on a different rank.
MemRank iv_ceUeRank;

#endif
};
Expand Down
8 changes: 4 additions & 4 deletions src/usr/diag/prdf/plat/prdfPlatServices.C
Expand Up @@ -865,8 +865,8 @@ uint32_t startBgScrub<TYPE_MCA>( ExtensibleChip * i_mcaChip,

#ifdef __HOSTBOOT_RUNTIME
// Starting a new command. Clear the UE and CE scrub stop counters
getMcbistDataBundle( mcbChip )->iv_ueScrubStopCounter.reset();
getMcbistDataBundle( mcbChip )->iv_ceScrubStopCounter.reset();
getMcbistDataBundle( mcbChip )->iv_ueStopCounter.reset();
getMcbistDataBundle( mcbChip )->iv_ceStopCounter.reset();
#endif

// Get the stop conditions.
Expand Down Expand Up @@ -1445,8 +1445,8 @@ uint32_t startBgScrub<TYPE_OCMB_CHIP>( ExtensibleChip * i_ocmb,
#ifdef __HOSTBOOT_RUNTIME
// Starting a new command. Clear the UE and CE scrub stop counters
getOcmbDataBundle( mcbChip )->iv_ueScrubStopCounter.reset();
getOcmbDataBundle( mcbChip )->iv_ceScrubStopCounter.reset();
getOcmbDataBundle( mcbChip )->iv_ueStopCounter.reset();
getOcmbDataBundle( mcbChip )->iv_ceStopCounter.reset();
#endif
// Get the stop conditions.
Expand Down
33 changes: 16 additions & 17 deletions src/usr/diag/prdf/plat/prdfPlatServices_rt.C
Expand Up @@ -152,7 +152,8 @@ uint32_t stopBgScrub<TYPE_MCA>( ExtensibleChip * i_chip )
//------------------------------------------------------------------------------

template<>
uint32_t resumeBgScrub<TYPE_MCBIST>( ExtensibleChip * i_chip )
uint32_t resumeBgScrub<TYPE_MCBIST>( ExtensibleChip * i_chip,
STEP_CODE_DATA_STRUCT & io_sc )
{
#define PRDF_FUNC "[PlatServices::resumeBgScrub<TYPE_MCBIST>] "

Expand All @@ -177,13 +178,13 @@ uint32_t resumeBgScrub<TYPE_MCBIST>( ExtensibleChip * i_chip )

// Check UE and CE stop counters to determine stop conditions
mss::mcbist::stop_conditions<> stopCond;
if ( getMcbistDataBundle(i_chip)->iv_ueScrubStopCounter.atTh() )
if ( getMcbistDataBundle(i_chip)->iv_ueStopCounter.thReached(io_sc) )
{
// If we've reached the limit of UEs we're allowed to stop on
// per rank, only set the stop on mpe stop condition.
stopCond.set_pause_on_mpe(mss::ON);
}
else if ( getMcbistDataBundle(i_chip)->iv_ceScrubStopCounter.atTh() )
else if (getMcbistDataBundle(i_chip)->iv_ceStopCounter.thReached(io_sc))
{
// If we've reached the limit of CEs we're allowed to stop on
// per rank, set all the normal stop conditions except stop on CE
Expand All @@ -208,12 +209,9 @@ uint32_t resumeBgScrub<TYPE_MCBIST>( ExtensibleChip * i_chip )
}

// Resume the command on the next address.
// Note: we have to limit the number of times a command has been stopped
// because of a UE/CE. Therefore, we must always resume the command to
// the end of the current slave rank so we can reset the UE/CE counts.
errlHndl_t errl;
FAPI_INVOKE_HWP( errl, mss::memdiags::continue_cmd, fapiTrgt,
mss::mcbist::STOP_AFTER_SLAVE_RANK, stopCond );
mss::mcbist::end_boundary::DONT_CHANGE, stopCond );

if ( nullptr != errl )
{
Expand All @@ -233,12 +231,14 @@ uint32_t resumeBgScrub<TYPE_MCBIST>( ExtensibleChip * i_chip )
//------------------------------------------------------------------------------

template<>
uint32_t resumeBgScrub<TYPE_MCA>( ExtensibleChip * i_chip )
uint32_t resumeBgScrub<TYPE_MCA>( ExtensibleChip * i_chip,
STEP_CODE_DATA_STRUCT & io_sc )
{
PRDF_ASSERT( nullptr != i_chip );
PRDF_ASSERT( TYPE_MCA == i_chip->getType() );

return resumeBgScrub<TYPE_MCBIST>(getConnectedParent(i_chip, TYPE_MCBIST));
return resumeBgScrub<TYPE_MCBIST>(getConnectedParent(i_chip, TYPE_MCBIST),
io_sc);
}

//##############################################################################
Expand Down Expand Up @@ -378,7 +378,8 @@ uint32_t __resumeScrub<TYPE_MBA>( ExtensibleChip * i_chip,
//------------------------------------------------------------------------------

template<>
uint32_t resumeBgScrub<TYPE_MBA>( ExtensibleChip * i_chip )
uint32_t resumeBgScrub<TYPE_MBA>( ExtensibleChip * i_chip,
STEP_CODE_DATA_STRUCT & io_sc )
{
PRDF_ASSERT( nullptr != i_chip );
PRDF_ASSERT( TYPE_MBA == i_chip->getType() );
Expand Down Expand Up @@ -458,7 +459,8 @@ uint32_t stopBgScrub<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip )
//------------------------------------------------------------------------------

template<>
uint32_t resumeBgScrub<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip )
uint32_t resumeBgScrub<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip,
STEP_CODE_DATA_STRUCT & io_sc )
{
#define PRDF_FUNC "[PlatServices::resumeBgScrub<TYPE_OCMB_CHIP>] "

Expand All @@ -485,13 +487,13 @@ uint32_t resumeBgScrub<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip )
// Check UE and CE stop counters to determine stop conditions
mss::mcbist::stop_conditions<> stopCond;
if ( getOcmbDataBundle(i_chip)->iv_ueScrubStopCounter.atTh() )
if ( getOcmbDataBundle(i_chip)->iv_ueStopCounter.thReached(io_sc) )
{
// If we've reached the limit of UEs we're allowed to stop on
// per rank, only set the stop on mpe stop condition.
stopCond.set_pause_on_mpe(mss::ON);
}
else if ( getOcmbDataBundle(i_chip)->iv_ceScrubStopCounter.atTh() )
else if ( getOcmbDataBundle(i_chip)->iv_ceStopCounter.thReached(io_sc) )
{
// If we've reached the limit of CEs we're allowed to stop on
// per rank, set all the normal stop conditions except stop on CE
Expand All @@ -516,12 +518,9 @@ uint32_t resumeBgScrub<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip )
}
// Resume the command on the next address.
// Note: we have to limit the number of times a command has been stopped
// because of a UE/CE. Therefore, we must always resume the command to
// the end of the current slave rank so we can reset the UE/CE counts.
errlHndl_t errl;
FAPI_INVOKE_HWP( errl, mss::memdiags::continue_cmd, fapiTrgt,
mss::mcbist::STOP_AFTER_SLAVE_RANK, stopCond );
mss::mcbist::end_boundary::DONT_CHANGE, stopCond );
if ( nullptr != errl )
{
Expand Down

0 comments on commit 5743467

Please sign in to comment.