From 5e97dc7721eaabc9049b24430c3065acfffcd86e Mon Sep 17 00:00:00 2001 From: Caleb Palmer Date: Sun, 1 Oct 2017 17:56:00 -0500 Subject: [PATCH] PRD: Runtime TPS Design Fixes Change-Id: I78675ac1b1f2fc5c3e331123d3e46bda2c2e643c CQ: SW403636 Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/46979 Tested-by: Jenkins Server Reviewed-by: Benjamin J. Weisenbeck Reviewed-by: Brian J. Stegmiller Reviewed-by: Zane C. Shelley Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/48074 Tested-by: FSP CI Jenkins Tested-by: Jenkins OP Build CI Tested-by: Jenkins OP HW --- src/usr/diag/prdf/plat/mem/prdfMemTps.H | 40 +++-- src/usr/diag/prdf/plat/mem/prdfMemTps_rt.C | 189 +++++++++++---------- 2 files changed, 122 insertions(+), 107 deletions(-) diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTps.H b/src/usr/diag/prdf/plat/mem/prdfMemTps.H index 577de3d6a9d..6facb1877f2 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemTps.H +++ b/src/usr/diag/prdf/plat/mem/prdfMemTps.H @@ -99,37 +99,35 @@ class TpsEvent : public TdEntry /** * @brief Analyzes the counts that summarize the symbol CE counts. - * @param i_badDqCount Number of nibbles with a bad DQ - * @param i_badChipCount Number of nibbles with a bad Chip - * @param i_nonZeroSumCount Number of nibbles under threshold with a - * non-zero sum - * @param i_singleSymCount Number of nibbles under threshold with a - * single symbol count greater than one - * @param i_symList Vector of all symbols with counts > 0. + * @param i_badDqCount Number of nibbles with a bad DQ + * @param i_badChipCount Number of nibbles with a bad Chip + * @param i_sumAboveOneCount Number of nibbles under threshold with a + * sum greater than 1 + * @param i_singleSymCount Number of nibbles under threshold with only + * a single symbol with a non-zero count, and + * that count is > 1 * @param io_sc The step code data struct. * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise. */ uint32_t analyzeCeSymbolCounts( CeCount i_badDqCount, - CeCount i_badChipCount, CeCount i_nonZeroSumCount, - CeCount i_singleSymCount, MemUtils::MaintSymbols i_symList, - STEP_CODE_DATA_STRUCT & io_sc ); + CeCount i_badChipCount, CeCount i_sumAboveOneCount, + CeCount i_singleSymCount, STEP_CODE_DATA_STRUCT & io_sc ); /** * @brief Gets the counts that summarize the symbol CE counts. - * @param io_badDqCount Number of nibbles with a bad DQ - * @param io_badChipCount Number of nibbles with a bad Chip - * @param io_nonZeroSumCount Number of nibbles under threshold with a - * non-zero sum - * @param io_singleSymCount Number of nibbles under threshold with a - * single symbol count greater than one - * @param o_symList Vector of all symbols with CE counts > 0. - * @param io_sc The step code data struct. + * @param io_badDqCount Number of nibbles with a bad DQ + * @param io_badChipCount Number of nibbles with a bad Chip + * @param io_sumAboveOneCount Number of nibbles under threshold with a + * sum greater than 1 + * @param io_singleSymCount Number of nibbles under threshold with only + * a single symbol with a non-zero count, and + * that count is > 1 + * @param io_sc The step code data struct. * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise. */ uint32_t getSymbolCeCounts( CeCount & io_badDqCount, - CeCount & io_badChipCount, CeCount & io_nonZeroSumCount, - CeCount & io_singleSymCount, MemUtils::MaintSymbols & o_symList, - STEP_CODE_DATA_STRUCT & io_sc ); + CeCount & io_badChipCount, CeCount & io_sumAboveOneCount, + CeCount & io_singleSymCount, STEP_CODE_DATA_STRUCT & io_sc ); /** * @brief Does isolation for CE attentions. diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTps_rt.C b/src/usr/diag/prdf/plat/mem/prdfMemTps_rt.C index 345d8a20ac4..ec0af140975 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemTps_rt.C +++ b/src/usr/diag/prdf/plat/mem/prdfMemTps_rt.C @@ -169,20 +169,30 @@ bool __badChipCount( MemUtils::MaintSymbols i_nibbleStats, //------------------------------------------------------------------------------ template -void __nonZeroSumCount( MemUtils::MaintSymbols i_nibbleStats, - CeCount & io_nonZeroSumCount ); +void __sumAboveOneCount( MemUtils::MaintSymbols i_nibbleStats, + CeCount & io_sumAboveOneCount ); template<> -void __nonZeroSumCount( MemUtils::MaintSymbols i_nibbleStats, - CeCount & io_nonZeroSumCount ) +void __sumAboveOneCount( MemUtils::MaintSymbols i_nibbleStats, + CeCount & io_sumAboveOneCount ) { + uint8_t sum = 0; + MemUtils::MaintSymbols symList; for ( auto symData : i_nibbleStats ) { - // If there is a non-zero sum. - if ( symData.count != 0 ) + if ( symData.count > 0 ) { - io_nonZeroSumCount.count++; - break; + sum += symData.count; + symList.push_back(symData); + } + } + // If the sum is greater than 1 + if ( sum > 1 ) + { + io_sumAboveOneCount.count++; + for ( auto sym : symList ) + { + io_sumAboveOneCount.symList.push_back(sym); } } } @@ -197,17 +207,25 @@ template<> void __singleSymbolCount( MemUtils::MaintSymbols i_nibbleStats, CeCount & io_singleSymCount ) { - // Count to keep track of the number of symbols whose CE count was > 1 uint8_t count = 0; + bool multNonZeroSyms = false; for ( auto symData : i_nibbleStats ) { - if ( symData.count > 1 ) - count++; + if ( symData.count > 0 ) + { + if ( 0 != count ) + { + // There are more than one symbol counts that are non-zero + multNonZeroSyms = true; + break; + } + count = symData.count; + } } - // If there was only 1 symbol whose CE count was > 1. - if ( 1 == count ) + // If there is only one symbol with a non-zero count and that count > 1 + if ( count > 1 && !multNonZeroSyms ) io_singleSymCount.count++; } @@ -216,7 +234,7 @@ void __singleSymbolCount( MemUtils::MaintSymbols i_nibbleStats, template void __analyzeNibbleSyms( MemUtils::MaintSymbols i_nibbleStats, CeCount & io_badDqCount, CeCount & io_badChipCount, - CeCount & io_nonZeroSumCount, CeCount & io_singleSymCount ) + CeCount & io_sumAboveOneCount, CeCount & io_singleSymCount ) { do @@ -229,11 +247,11 @@ void __analyzeNibbleSyms( MemUtils::MaintSymbols i_nibbleStats, if ( __badChipCount( i_nibbleStats, io_badChipCount ) ) break; - // Check if this nibble is under threshold with a non-zero sum. - __nonZeroSumCount( i_nibbleStats, io_nonZeroSumCount ); + // Check if this nibble is under threshold with a sum greater than 1. + __sumAboveOneCount( i_nibbleStats, io_sumAboveOneCount ); - // Check if this nibble is under threshold with a single symbol count - // greater than 1. + // Check if this nibble is under threshold with only a single symbol + // with a non-zero count, and that count is > 1. __singleSymbolCount( i_nibbleStats, io_singleSymCount ); }while(0); @@ -242,29 +260,26 @@ void __analyzeNibbleSyms( MemUtils::MaintSymbols i_nibbleStats, //------------------------------------------------------------------------------ template -uint32_t __updateVpdCountAboveOne( MemUtils::MaintSymbols i_symList, - MemDqBitmap & io_dqBitmap ) +uint32_t __updateVpdSumAboveOne( CeCount i_sumAboveOneCount, + MemDqBitmap & io_dqBitmap ) { - #define PRDF_FUNC "[__updateVpdCountAboveOne] " + #define PRDF_FUNC "[__updateVpdSumAboveOne] " uint32_t o_rc = SUCCESS; do { - // Update VPD with all symbols that have a count greater than 1. This is - // so if we do TPS again, we'll callout again even if the symbol - // counters change. - for ( auto sym : i_symList ) + // For nibbles under threshold with a sum greater than 1, update VPD + // with it's non-zero symbols. This is so if we do TPS again, we'll + // callout again even if the symbol counters change. + for ( auto sym : i_sumAboveOneCount.symList ) { - if ( sym.count > 1 ) + o_rc = io_dqBitmap.setSymbol( sym.symbol ); + if ( SUCCESS != o_rc ) { - o_rc = io_dqBitmap.setSymbol( sym.symbol ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "io_dqBitmap.setSymbol failed." ); - break; - } + PRDF_ERR( PRDF_FUNC "io_dqBitmap.setSymbol failed." ); + break; } } }while(0); @@ -452,9 +467,8 @@ uint32_t TpsEvent::analyzeEcc( const uint32_t & i_eccAttns, template<> uint32_t TpsEvent::analyzeCeSymbolCounts( CeCount i_badDqCount, - CeCount i_badChipCount, CeCount i_nonZeroSumCount, - CeCount i_singleSymCount, MemUtils::MaintSymbols i_symList, - STEP_CODE_DATA_STRUCT & io_sc ) + CeCount i_badChipCount, CeCount i_sumAboveOneCount, + CeCount i_singleSymCount, STEP_CODE_DATA_STRUCT & io_sc ) { #define PRDF_FUNC "[TpsEvent::analyzeCeSymbolCounts] " @@ -512,10 +526,10 @@ uint32_t TpsEvent::analyzeCeSymbolCounts( CeCount i_badDqCount, // If the symbol mark is available. if ( !symMark.isValid() ) { - // If the non-zero sum nibble count is <= 1 or non-zero sum + // If the sum above one nibble count is <= 1 or sum above one // nibble count == 2 and single sym nibble count == 2 - if ( (i_nonZeroSumCount.count <= 1) || - (i_nonZeroSumCount.count == 2 && + if ( (i_sumAboveOneCount.count <= 1) || + (i_sumAboveOneCount.count == 2 && i_singleSymCount.count == 2) ) { // This means we have a potential future chip kill or @@ -547,12 +561,13 @@ uint32_t TpsEvent::analyzeCeSymbolCounts( CeCount i_badDqCount, else { // Placing a symbol mark risks a UE. - // Update VPD with all symbols that have a count > 1. - o_rc = __updateVpdCountAboveOne( - i_symList, dqBitmap ); + // For nibbles under threshold with a sum greater than 1, + // update VPD with it's non-zero symbols. + o_rc = __updateVpdSumAboveOne( + i_sumAboveOneCount, dqBitmap ); if ( SUCCESS != o_rc ) { - PRDF_ERR( PRDF_FUNC "__updateVpdCountAboveOne() failed." ); } @@ -582,10 +597,10 @@ uint32_t TpsEvent::analyzeCeSymbolCounts( CeCount i_badDqCount, // If the symbol mark is available. if ( !symMark.isValid() ) { - // If the non-zero sum nibble count is = 0 or non-zero sum + // If the sum above one nibble count is = 0 or sum above one // nibble count = 1 and single sym nibble count = 1 - if ( (i_nonZeroSumCount.count == 0) || - (i_nonZeroSumCount.count == 1 && + if ( (i_sumAboveOneCount.count == 0) || + (i_sumAboveOneCount.count == 1 && i_singleSymCount.count == 1) ) { // This means we have only one more potential bad DQ, which @@ -627,12 +642,13 @@ uint32_t TpsEvent::analyzeCeSymbolCounts( CeCount i_badDqCount, else { // Placing a symbol mark risks a UE. - // Update VPD with all symbols that have a count > 1. - o_rc = __updateVpdCountAboveOne( - i_symList, dqBitmap ); + // For nibbles under threshold with a sum greater than 1, + // update VPD with it's non-zero symbols. + o_rc = __updateVpdSumAboveOne( + i_sumAboveOneCount, dqBitmap ); if ( SUCCESS != o_rc ) { - PRDF_ERR( PRDF_FUNC "__updateVpdCountAboveOne() failed." ); } @@ -672,10 +688,10 @@ uint32_t TpsEvent::analyzeCeSymbolCounts( CeCount i_badDqCount, // If the chip mark is available. if ( !chipMark.isValid() ) { - // If the non-zero sum nibble count is = 0 or the non-zero sum + // If the sum above one nibble count is = 0 or the sum above one // nibble count = 1 and the single sym nibble count = 1 - if ( (i_nonZeroSumCount.count == 0) || - (i_nonZeroSumCount.count == 1 && + if ( (i_sumAboveOneCount.count == 0) || + (i_sumAboveOneCount.count == 1 && i_singleSymCount.count == 1) ) { // This means we have only one more potential bad DQ, which @@ -705,12 +721,13 @@ uint32_t TpsEvent::analyzeCeSymbolCounts( CeCount i_badDqCount, else { // Placing a mark risks a UE. - // Update VPD with all symbols that have a count > 1. - o_rc = __updateVpdCountAboveOne( - i_symList, dqBitmap ); + // For nibbles under threshold with a sum greater than 1, + // update VPD with it's non-zero symbols. + o_rc = __updateVpdSumAboveOne( + i_sumAboveOneCount, dqBitmap ); if ( SUCCESS != o_rc ) { - PRDF_ERR( PRDF_FUNC "__updateVpdCountAboveOne() failed." ); } @@ -749,8 +766,8 @@ uint32_t TpsEvent::analyzeCeSymbolCounts( CeCount i_badDqCount, // If the chip mark is available. if ( !chipMark.isValid() ) { - // If the non-zero sum nibble count is 0 - if ( 0 == i_nonZeroSumCount.count ) + // If the sum above one nibble count is 0 + if ( 0 == i_sumAboveOneCount.count ) { // This means we have no more potential bad DQ or bad chips // since we can't correct those after chip mark is placed. @@ -783,12 +800,13 @@ uint32_t TpsEvent::analyzeCeSymbolCounts( CeCount i_badDqCount, else { // Placing a chip mark risks a UE. - // Update VPD with all symbols that have a count > 1. - o_rc = __updateVpdCountAboveOne( - i_symList, dqBitmap ); + // For nibbles under threshold with a sum greater than 1, + // update VPD with it's non-zero symbols. + o_rc = __updateVpdSumAboveOne( + i_sumAboveOneCount, dqBitmap ); if ( SUCCESS != o_rc ) { - PRDF_ERR( PRDF_FUNC "__updateVpdCountAboveOne() failed." ); } @@ -805,8 +823,8 @@ uint32_t TpsEvent::analyzeCeSymbolCounts( CeCount i_badDqCount, // If the symbol mark is available. if ( !symMark.isValid() ) { - // If the non-zero sum nibble count is 0 - if ( 0 == i_nonZeroSumCount.count ) + // If the sum above one nibble count is 0 + if ( 0 == i_sumAboveOneCount.count ) { // This means we have no more potential bad DQ or bad chips // since we can't correct those after symbol mark is placed. @@ -839,12 +857,13 @@ uint32_t TpsEvent::analyzeCeSymbolCounts( CeCount i_badDqCount, else { // Placing the symbol mark risks a UE. - // Update VPD with all symbols that have a count > 1. - o_rc = __updateVpdCountAboveOne( - i_symList, dqBitmap ); + // For nibbles under threshold with a sum greater than 1, + // update VPD with it's non-zero symbols. + o_rc = __updateVpdSumAboveOne( + i_sumAboveOneCount, dqBitmap ); if ( SUCCESS != o_rc ) { - PRDF_ERR( PRDF_FUNC "__updateVpdCountAboveOne() failed." ); } @@ -863,12 +882,13 @@ uint32_t TpsEvent::analyzeCeSymbolCounts( CeCount i_badDqCount, else { // There are enough errors that this could be a potential UE. - // Update VPD with all symbols that have a count > 1. - o_rc = __updateVpdCountAboveOne( - i_symList, dqBitmap ); + // For nibbles under threshold with a sum greater than 1, + // update VPD with it's non-zero symbols. + o_rc = __updateVpdSumAboveOne( + i_sumAboveOneCount, dqBitmap ); if ( SUCCESS != o_rc ) { - PRDF_ERR( PRDF_FUNC "__updateVpdCountAboveOne() failed." ); } @@ -931,9 +951,8 @@ uint32_t TpsEvent::analyzeCeSymbolCounts( CeCount i_badDqCount, template<> uint32_t TpsEvent::getSymbolCeCounts( CeCount & io_badDqCount, - CeCount & io_badChipCount, CeCount & io_nonZeroSumCount, - CeCount & io_singleSymCount, MemUtils::MaintSymbols & o_symList, - STEP_CODE_DATA_STRUCT & io_sc ) + CeCount & io_badChipCount, CeCount & io_sumAboveOneCount, + CeCount & io_singleSymCount, STEP_CODE_DATA_STRUCT & io_sc ) { #define PRDF_FUNC "[TpsEvent::getSymbolCeCounts] " @@ -1003,8 +1022,6 @@ uint32_t TpsEvent::getSymbolCeCounts( CeCount & io_badDqCount, symData.count = reg->GetBitFieldJustified(((i+n)*8), 8); nibbleStats.push_back( symData ); - if ( symData.count > 0 ) - o_symList.push_back( symData ); // Add all symbols with non-zero counts to the callout list. if ( symData.count != 0 ) @@ -1017,7 +1034,7 @@ uint32_t TpsEvent::getSymbolCeCounts( CeCount & io_badDqCount, // Analyze the nibble of symbols. __analyzeNibbleSyms( nibbleStats, io_badDqCount, - io_badChipCount, io_nonZeroSumCount, io_singleSymCount ); + io_badChipCount, io_sumAboveOneCount, io_singleSymCount ); } if ( SUCCESS != o_rc ) break; @@ -1045,14 +1062,14 @@ uint32_t TpsEvent::analyzeCe( STEP_CODE_DATA_STRUCT & io_sc ) // The symbol CE counts will be summarized in the following buckets: // Number of nibbles with a bad DQ // Number of nibbles with a bad chip - // Number of nibbles under threshold with a non-zero sum - // Number of nibbles under threshold with a single symbol count > 1 - CeCount badDqCount, badChipCount, nonZeroSumCount, singleSymCount; - MemUtils::MaintSymbols symList; + // Number of nibbles under threshold with a sum greater than 1 + // Number of nibbles under threshold with only a single symbol with a + // non-zero count, and that count is > 1 + CeCount badDqCount, badChipCount, sumAboveOneCount, singleSymCount; // Get the symbol CE counts. - o_rc = getSymbolCeCounts( badDqCount, badChipCount, nonZeroSumCount, - singleSymCount, symList, io_sc ); + o_rc = getSymbolCeCounts( badDqCount, badChipCount, sumAboveOneCount, + singleSymCount, io_sc ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "getSymbolCeCounts failed." ); @@ -1071,8 +1088,8 @@ uint32_t TpsEvent::analyzeCe( STEP_CODE_DATA_STRUCT & io_sc ) } // Analyze the symbol CE counts. - o_rc = analyzeCeSymbolCounts( badDqCount, badChipCount, nonZeroSumCount, - singleSymCount, symList, io_sc ); + o_rc = analyzeCeSymbolCounts(badDqCount, badChipCount, sumAboveOneCount, + singleSymCount, io_sc); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "analyzeCeSymbolCounts failed." );