Skip to content

Commit

Permalink
PRD: fixed how RT TPS procedures are banned from processing
Browse files Browse the repository at this point in the history
If banned, the TPS procedure must be prevented from being added to the
queue. This fixes the issue where background scrubbing gets stopped
manually and the TPS procedure is never executed. This also fixes a
flooding issue when TPS is not available to fix persistent errors.

Change-Id: I76cc9f7ce7c06587261ff593a626a4ef51b317e1
RTC: 192009
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/57919
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Reviewed-by: Caleb N. Palmer <cnpalmer@us.ibm.com>
Reviewed-by: Matt Derksen <mderkse1@us.ibm.com>
Reviewed-by: Benjamin J. Weisenbeck <bweisenb@us.ibm.com>
Reviewed-by: Brian J. Stegmiller <bjs@us.ibm.com>
Reviewed-by: Zane C. Shelley <zshelle@us.ibm.com>
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/58327
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
  • Loading branch information
zane131 committed May 5, 2018
1 parent 3d5c1c5 commit e811117
Show file tree
Hide file tree
Showing 6 changed files with 83 additions and 25 deletions.
22 changes: 22 additions & 0 deletions src/usr/diag/prdf/common/plat/mem/prdfMemDbUtils.H
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,28 @@ uint32_t handleTdEvent<TARGETING::TYPE_MBA>( ExtensibleChip * i_chip,
return getMbaDataBundle(i_chip)->getTdCtlr()->handleTdEvent( io_sc );
}

/**
 * @brief  Generic wrapper to tell the TD controller to ban TPS on a rank.
 * @note   Only explicit specializations of this template are usable; each one
 *         forwards the request to the TD controller obtained from the chip's
 *         data bundle.
 * @tparam T      The target type of i_chip (TYPE_MCA or TYPE_MBA).
 * @param  i_chip MCA or MBA.
 * @param  i_rank The target slave rank.
 */
template<TARGETING::TYPE T>
void banTps( ExtensibleChip * i_chip, const MemRank & i_rank );

/** MCA flavor: forwards the ban request to the TD controller obtained from the
 *  MCA data bundle of i_chip. */
template<> inline
void banTps<TARGETING::TYPE_MCA>( ExtensibleChip * i_chip,
                                  const MemRank & i_rank )
{
    // Fetch the controller first, then hand it the chip/rank to ban.
    auto && tdCtlr = getMcaDataBundle(i_chip)->getTdCtlr();
    tdCtlr->banTps( i_chip, i_rank );
}

/** MBA flavor: forwards the ban request to the TD controller obtained from the
 *  MBA data bundle of i_chip. */
template<> inline
void banTps<TARGETING::TYPE_MBA>( ExtensibleChip * i_chip,
                                  const MemRank & i_rank )
{
    // Fetch the controller first, then hand it the chip/rank to ban.
    auto && tdCtlr = getMbaDataBundle(i_chip)->getTdCtlr();
    tdCtlr->banTps( i_chip, i_rank );
}

#endif // Hostboot Runtime only

} // end namespace MemDbUtils
Expand Down
5 changes: 1 addition & 4 deletions src/usr/diag/prdf/common/plat/mem/prdfP9McaDataBundle.H
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
/* */
/* OpenPOWER HostBoot Project */
/* */
/* Contributors Listed Below - COPYRIGHT 2016,2017 */
/* Contributors Listed Below - COPYRIGHT 2016,2018 */
/* [+] International Business Machines Corp. */
/* */
/* */
Expand Down Expand Up @@ -202,9 +202,6 @@ class McaDataBundle : public DataBundle
* scrubbing is resumed. */
bool iv_maskMainlineNceTce = false;

/** Map to keep track of ranks that have banned TPS. */
std::map<MemRank, bool> iv_tpsBans;

#endif
};

Expand Down
43 changes: 39 additions & 4 deletions src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.H
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,20 @@ class MemTdCtlr
* diagnostics, if not already in progress.
* @param i_entry The new TD queue entry.
*/
void pushToQueue( TdEntry * i_entry ) { iv_queue.push(i_entry); }
void pushToQueue( TdEntry * i_entry )
{
    #ifdef __HOSTBOOT_RUNTIME
    // At runtime, a TPS request for a chip/rank on which TPS has been banned
    // must never reach the queue.
    if ( TdEntry::TPS_EVENT == i_entry->getType() &&
         isTpsBanned(i_entry->getChip(), i_entry->getRank()) )
    {
        // NOTE(review): getRank() returns a MemRank object, which is passed
        // here to a %02x specifier. Confirm the trace macro handles that, or
        // pass an integral representation of the rank instead.
        PRDF_ERR( "[MemTdCtlr::pushToQueue] TPS banned on 0x%08x 0x%02x",
                  i_entry->getChip()->getHuid(), i_entry->getRank() );

        // The entry is dropped rather than handed to the queue, so it must be
        // released here or it is leaked on every rejected request.
        // Assumes callers heap-allocate the entry and transfer ownership to
        // this function -- TODO confirm against callers.
        delete i_entry;

        return; // prevent the entry from being added to the queue.
    }
    #endif

    iv_queue.push(i_entry);
}

#ifdef __HOSTBOOT_RUNTIME

Expand All @@ -106,6 +119,18 @@ class MemTdCtlr
*/
uint32_t handleTdEvent( STEP_CODE_DATA_STRUCT & io_sc );

/**
 * @brief Records a TPS ban for the given chip/rank pair. Once banned, any
 *        attempt to add a TPS procedure for that pair to the queue is
 *        ignored.
 * @param i_chip The chip on which the rank resides.
 * @param i_rank The target slave rank.
 */
void banTps( ExtensibleChip * i_chip, const MemRank & i_rank )
{
    // Only the presence of the key is ever checked (see isTpsBanned()), so
    // the mapped value itself is arbitrary.
    const std::pair<ExtensibleChip *, MemRank> banKey( i_chip, i_rank );
    iv_tpsBans[banKey] = true;
}

/**
* @brief Handles reset-reload or FO scenario.
*
Expand Down Expand Up @@ -237,6 +262,17 @@ class MemTdCtlr

#ifdef __HOSTBOOT_RUNTIME

/**
 * @brief  Queries whether TPS has been banned for a chip/rank pair.
 * @param  i_chip The chip on which the rank resides.
 * @param  i_rank The target slave rank.
 * @return True, if this slave rank has been banned. False, otherwise.
 */
bool isTpsBanned( ExtensibleChip * i_chip, const MemRank & i_rank )
{
    // A ban is represented purely by the key being present in the map.
    return ( 0 != iv_tpsBans.count(std::make_pair(i_chip, i_rank)) );
}

/**
* @brief Masks NCE and TCE ECC attentions.
* @note Only intended to be used just before starting a new TD procedure.
Expand Down Expand Up @@ -287,9 +323,8 @@ class MemTdCtlr
* to be restarted with a new command. */
bool iv_resumeBgScrub = false;

/** Keeps track if the fetch attentions have been masked during a TD
* procedure. */
bool iv_fetchAttnsMasked = false;
/** Map to keep track of ranks that have banned TPS. */
std::map< std::pair<ExtensibleChip *, MemRank>, bool > iv_tpsBans;

#else // IPL only

Expand Down
20 changes: 16 additions & 4 deletions src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,22 @@ uint32_t MemTdCtlr<T>::handleTdEvent( STEP_CODE_DATA_STRUCT & io_sc )
// Don't interrupt a TD procedure if one is already in progress.
if ( nullptr != iv_curProcedure ) break;

// If the queue is empty, there is nothing to do, so there is no point in
// stopping background scrub. This can happen if TPS was banned on a rank
// and the TPS request was never added to the queue. In that case, mask
// fetch attentions temporarily to prevent flooding.
if ( iv_queue.empty() )
{
o_rc = maskEccAttns();
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "maskEccAttns() failed" );
break;
}

break; // Don't stop background scrub.
}

// Stop background scrubbing.
o_rc = stopBgScrub<T>( iv_chip );
if ( SUCCESS != o_rc )
Expand Down Expand Up @@ -849,8 +865,6 @@ uint32_t MemTdCtlr<TYPE_MBA>::maskEccAttns()
break;
}

iv_fetchAttnsMasked = true;

} while (0);

return o_rc;
Expand Down Expand Up @@ -903,8 +917,6 @@ uint32_t MemTdCtlr<TYPE_MBA>::unmaskEccAttns()
break;
}

iv_fetchAttnsMasked = false;

} while (0);

return o_rc;
Expand Down
3 changes: 3 additions & 0 deletions src/usr/diag/prdf/plat/mem/prdfMemTdQueue.H
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,9 @@ class TdEntry
/**
 * @return The event type
 */
TdType getType() const
{
    return iv_tdType;
}

/**
 * @return The chip in which this event occurred
 */
ExtensibleChip * getChip() const
{
    return iv_chip;
}

/**
 * @return The rank in which this event occurred (returned by value)
 */
MemRank getRank() const
{
    return iv_rank;
}

Expand Down
15 changes: 2 additions & 13 deletions src/usr/diag/prdf/plat/mem/prdfMemTps_rt.C
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,13 @@
/** @file prdfMemTps_rt.C */

// Platform includes
#include <prdfCenMbaDataBundle.H>
#include <prdfMemDbUtils.H>
#include <prdfMemEccAnalysis.H>
#include <prdfMemMark.H>
#include <prdfMemScrubUtils.H>
#include <prdfMemTdFalseAlarm.H>
#include <prdfMemTps.H>
#include <prdfP9McaExtraSig.H>
#include <prdfP9McaDataBundle.H>
#include <prdfTargetServices.H>

using namespace TARGETING;
Expand Down Expand Up @@ -350,9 +349,7 @@ uint32_t TpsEvent<T>::analyzeTpsPhase1_rt( STEP_CODE_DATA_STRUCT & io_sc,
// If iv_ban is true and this procedure is done, then ban TPS on this rank.
if ( iv_ban && o_done )
{
// It doesn't matter what we set the value to, we just need to
// make sure the rank exists in the map.
getMcaDataBundle(iv_chip)->iv_tpsBans[iv_rank] = true;
MemDbUtils::banTps<T>( iv_chip, iv_rank );

// Permanently mask mainline NCEs and TCEs.
getMcaDataBundle(iv_chip)->iv_maskMainlineNceTce = true;
Expand Down Expand Up @@ -1145,14 +1142,6 @@ uint32_t TpsEvent<TYPE_MCA>::nextStep( STEP_CODE_DATA_STRUCT & io_sc,

do
{
// Check if TPS is banned on this rank.
if ( 1 == getMcaDataBundle(iv_chip)->iv_tpsBans.count(iv_rank) )
{
// If TPS is banned, abort the procedure.
o_done = true;
break;
}

// Runtime TPS is slightly different than IPL TPS or any other TD event.
// There really is only one phase, but we use two phases to help
// differentiate between the CE types that are collected. So only one of
Expand Down

0 comments on commit e811117

Please sign in to comment.