Skip to content

Commit

Permalink
PRD: Maint soft/inter/hard CE handling during background scrub for Centaur
Browse files Browse the repository at this point in the history

Change-Id: I9363812d7e3a7fcca46e481c6250d810bfcd970a
RTC: 192638
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/58980
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Reviewed-by: Caleb N. Palmer <cnpalmer@us.ibm.com>
Reviewed-by: Matt Derksen <mderkse1@us.ibm.com>
Reviewed-by: Benjamin J. Weisenbeck <bweisenb@us.ibm.com>
Reviewed-by: Brian J. Stegmiller <bjs@us.ibm.com>
Reviewed-by: Zane C. Shelley <zshelle@us.ibm.com>
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/59229
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
  • Loading branch information
zane131 committed May 23, 2018
1 parent 0d6b900 commit e940af9
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 245 deletions.
219 changes: 0 additions & 219 deletions src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_rt.C
Original file line number Diff line number Diff line change
Expand Up @@ -468,124 +468,6 @@ int32_t CenMbaTdCtlr::initialize()

//------------------------------------------------------------------------------

/**
 * @brief  Analyzes a stopped maintenance command while the TD state machine
 *         is in the NO_OP state.
 *
 * Records the interrupted rank, gathers the reported ECC error conditions,
 * handles soft/intermittent and hard CE ETEs, and then either resumes
 * background scrubbing (TD queue empty) or starts the next TD request.
 *
 * @param  io_sc      The step code data struct.
 * @param  i_stopAddr The address in which the maintenance command stopped.
 * @param  i_endAddr  The address compared against i_stopAddr to tell whether
 *                    the scrub reached its end address.
 * @return Non-SUCCESS if the state is not NO_OP or an internal function
 *         fails, SUCCESS otherwise.
 */
int32_t CenMbaTdCtlr::analyzeCmdComplete( STEP_CODE_DATA_STRUCT & io_sc,
                                          const CenAddr & i_stopAddr,
                                          const CenAddr & i_endAddr )
{
    #define PRDF_FUNC "[CenMbaTdCtlr::analyzeCmdComplete] "

    // This path is only valid when the state machine is in NO_OP.
    if ( NO_OP != iv_tdState )
    {
        PRDF_ERR( PRDF_FUNC "Invalid state machine configuration" );
        return FAIL;
    }

    // iv_rank has to be accurate before any of the helpers below are called,
    // so it is set first.
    iv_rank = i_stopAddr.getRank();

    // Background scrubbing was interrupted (most likely by an ECC error), so
    // record the interrupted rank in the rank list.
    int32_t rc = iv_masterRanks.setInterruptedRank( iv_rank );
    if ( SUCCESS != rc )
    {
        PRDF_ERR( PRDF_FUNC "setInterruptedRank() failed" );
        return rc;
    }

    // Collect every reported error condition into a single mask.
    uint16_t eccErrorMask = NO_ERROR;
    rc = checkEccErrors( eccErrorMask, io_sc );
    if ( SUCCESS != rc )
    {
        PRDF_ERR( PRDF_FUNC "checkEccErrors() failed" );
        return rc;
    }

    // Ordering matters from here on: each handler sets the PRD signature and
    // overrides the previous one. The handlers therefore run from lowest to
    // highest priority so the highest priority signature is the one displayed.

    const bool softOrInterCte =
                    (eccErrorMask & SOFT_CTE) || (eccErrorMask & INTER_CTE);
    if ( softOrInterCte )
    {
        rc = handleSoftIntCeEte_NonTd( io_sc );
        if ( SUCCESS != rc )
        {
            PRDF_ERR( PRDF_FUNC "handleSoftIntCeEte_NonTd() failed" );
            return rc;
        }
    }

    const bool hardCte = ( 0 != (eccErrorMask & HARD_CTE) );
    if ( hardCte )
    {
        rc = handleHardCeEte_NonTd( io_sc, i_stopAddr );
        if ( SUCCESS != rc )
        {
            PRDF_ERR( PRDF_FUNC "handleHardCeEte_NonTd() failed" );
            return rc;
        }
    }

    // A TD request was added to the queue, start the next TD request.
    if ( !iv_queue.empty() )
    {
        rc = startNextTd( io_sc );
        if ( SUCCESS != rc )
        {
            PRDF_ERR( PRDF_FUNC "startNextTd() failed" );
        }
        return rc;
    }

    // No TD requests are pending, so background scrubbing continues.

    if ( !(i_endAddr == i_stopAddr) )
    {
        // The scrub stopped before its end address: restart it on the next
        // address.
        rc = resumeScrub( io_sc, eccErrorMask );
        if ( SUCCESS != rc )
        {
            PRDF_ERR( PRDF_FUNC "resumeScrub() failed" );
        }
        return rc;
    }

    // The scrub reached the end address: move on to the next good rank.

    const bool nothingToReport =
                    (NO_ERROR == eccErrorMask) || (MCE == eccErrorMask);
    if ( nothingToReport )
    {
        // The scrub completed without an error (MCEs are currently ignored
        // by this function). Skip committing the error log to cut down on
        // informational error logs.
        io_sc.service_data->setDontCommitErrl();
    }

    rc = startBgScrub( io_sc );
    if ( SUCCESS != rc )
    {
        PRDF_ERR( PRDF_FUNC "startBgScrub() failed" );
    }

    return rc;

    #undef PRDF_FUNC
}

//------------------------------------------------------------------------------

int32_t CenMbaTdCtlr::analyzeVcmPhase1( STEP_CODE_DATA_STRUCT & io_sc,
const CenAddr & i_stopAddr,
const CenAddr & i_endAddr )
Expand Down Expand Up @@ -1926,107 +1808,6 @@ int32_t CenMbaTdCtlr::handleCeEte_Tps( STEP_CODE_DATA_STRUCT & io_sc )

//------------------------------------------------------------------------------

/**
 * @brief  Handles hard CE ETEs during background scrub.
 *
 * Sets the PRDFSIG_MaintHARD_CTE signature, requests page deallocation for
 * the address, calls out the single failing symbol, records the CE in the CE
 * table (queuing a TPS request when addEntry() returns true), and makes the
 * error a service call when in manufacturing mode.
 *
 * @param  io_sc  The step code data struct.
 * @param  i_addr The address in which the maintenance command stopped.
 * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise.
 */
int32_t CenMbaTdCtlr::handleHardCeEte_NonTd( STEP_CODE_DATA_STRUCT & io_sc,
                                             const CenAddr & i_addr )
{
    #define PRDF_FUNC "[CenMbaTdCtlr::handleHardCeEte_NonTd] "

    setTdSignature( io_sc, PRDFSIG_MaintHARD_CTE );

    // Send page deallocation message to PHYP.
    int32_t rc = DEALLOC::pageGard( iv_mbaChip, i_addr );
    if ( SUCCESS != rc )
    {
        PRDF_ERR( PRDF_FUNC "pageGard() failed" );
        return rc;
    }

    // Find the failing symbol. The hard CE threshold is 1, so exactly one
    // symbol is expected to have a non-zero per symbol count.
    MaintSymbols symData;
    CenSymbol    unusedSym; // required by collectCeStats(), not used here
    rc = collectCeStats( iv_mbaChip, iv_rank, symData, unusedSym );
    if ( SUCCESS != rc )
    {
        PRDF_ERR( PRDF_FUNC "collectCeStats() failed." );
        return rc;
    }

    if ( 1 != symData.size() )
    {
        PRDF_ERR( PRDF_FUNC "collectCeStats() return size %d, but was "
                  "expecting size 1", symData.size() );
        return FAIL;
    }

    CenSymbol failingSym = symData[0].symbol;

    // Call out the failing symbol.
    MemoryMru memmru ( iv_mbaTrgt, iv_rank, failingSym );
    io_sc.service_data->SetCallout( memmru );

    // Record the CE in the CE table. A true return from addEntry() means a
    // TPS request must be added to the queue.
    CenMbaDataBundle * mbadb = getMbaDataBundle( iv_mbaChip );
    const bool tpsNeeded = mbadb->iv_ceTable.addEntry( i_addr, failingSym,
                                                       true );
    if ( tpsNeeded )
    {
        rc = addTdQueueEntryTPS( iv_rank, io_sc );
        if ( SUCCESS != rc )
        {
            PRDF_ERR( PRDF_FUNC "addTdQueueEntryTPS() failed" );
            return rc;
        }
    }

    // Any hard CEs in MNFG should be immediately reported.
    if ( mfgMode() )
    {
        io_sc.service_data->setServiceCall();
    }

    return rc;

    #undef PRDF_FUNC
}

//------------------------------------------------------------------------------

/**
 * @brief  Handles soft and intermittent CEs during background scrub.
 *
 * Sets the PRDFSIG_MaintNCE_CTE signature, calls out the current rank
 * (iv_rank), and adds a TPS request for that rank to the TD queue.
 *
 * @param  io_sc The step code data struct.
 * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise.
 */
int32_t CenMbaTdCtlr::handleSoftIntCeEte_NonTd( STEP_CODE_DATA_STRUCT & io_sc )
{
    #define PRDF_FUNC "[CenMbaTdCtlr::handleSoftIntCeEte_NonTd] "

    setTdSignature( io_sc, PRDFSIG_MaintNCE_CTE );

    // The per CE counters only capture hard CEs, so nothing finer than the
    // rank can be isolated here. Call out the whole rank.
    MemoryMru rankCallout ( iv_mbaTrgt, iv_rank, MemoryMruData::CALLOUT_RANK );
    io_sc.service_data->SetCallout( rankCallout );

    // Queue a TPS request for the rank.
    const int32_t rc = addTdQueueEntryTPS( iv_rank, io_sc );
    if ( SUCCESS != rc )
    {
        PRDF_ERR( PRDF_FUNC "addTdQueueEntryTPS() failed" );
    }

    return rc;

    #undef PRDF_FUNC
}

//------------------------------------------------------------------------------

int32_t CenMbaTdCtlr::handleTpsFalseAlarm( STEP_CODE_DATA_STRUCT & io_sc )
{
#define PRDF_FUNC "[CenMbaTdCtlr::handleTpsFalseAlarm] "
Expand Down
16 changes: 0 additions & 16 deletions src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_rt.H
Original file line number Diff line number Diff line change
Expand Up @@ -261,22 +261,6 @@ class CenMbaTdCtlr : public CenMbaTdCtlrCommon
*/
int32_t handleCeEte_Tps( STEP_CODE_DATA_STRUCT & io_sc );

/**
 * @brief  Handles hard CE ETEs during background scrub.
 *
 * Requests page deallocation for the given address, calls out the failing
 * symbol, adds the CE to the CE table (queuing a TPS request if the table
 * indicates one is needed), and makes the error a service call when in
 * manufacturing mode.
 *
 * @param  io_sc  The step code data struct.
 * @param  i_addr The address in which the maintenance command stopped.
 * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise.
 */
int32_t handleHardCeEte_NonTd( STEP_CODE_DATA_STRUCT & io_sc,
const CenAddr & i_addr );

/**
 * @brief  Handles soft and intermittent CEs during background scrub.
 *
 * Calls out the current rank (no finer isolation is possible because the
 * per CE counters only capture hard CEs) and adds a TPS request for that
 * rank to the TD queue.
 *
 * @param  io_sc The step code data struct.
 * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise.
 */
int32_t handleSoftIntCeEte_NonTd( STEP_CODE_DATA_STRUCT & io_sc );

/**
* @brief Handles TPS false alarms.
* @param io_sc The step code data struct.
Expand Down
52 changes: 42 additions & 10 deletions src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C
Original file line number Diff line number Diff line change
Expand Up @@ -523,6 +523,42 @@ uint32_t __handleNceEte( ExtensibleChip * i_chip, TdQueue & io_queue,

//------------------------------------------------------------------------------

/**
 * @brief  Handles a soft or intermittent CE ETE reported by a maintenance
 *         command. Only declared here; the behavior is supplied by the
 *         explicit specializations for each supported target type.
 * @param  i_chip   The chip on which the error was reported.
 * @param  io_queue The TD queue; specializations may push new TD events.
 * @param  i_addr   The address associated with the error.
 * @param  io_sc    The step code data struct.
 * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise.
 */
template<TARGETING::TYPE T>
uint32_t __handleSoftInterCeEte( ExtensibleChip * i_chip, TdQueue & io_queue,
const MemAddr & i_addr,
STEP_CODE_DATA_STRUCT & io_sc );

// MCA specialization: soft and intermittent CE ETEs get the same handling as
// NCE ETEs, so simply forward all arguments to __handleNceEte<TYPE_MCA>().
template<>
uint32_t __handleSoftInterCeEte<TYPE_MCA>( ExtensibleChip * i_chip,
                                           TdQueue & io_queue,
                                           const MemAddr & i_addr,
                                           STEP_CODE_DATA_STRUCT & io_sc )
{
    const uint32_t rc = __handleNceEte<TYPE_MCA>( i_chip, io_queue,
                                                  i_addr, io_sc );
    return rc;
}

// MBA specialization: calls out the rank of i_addr and queues a TPS event for
// that rank. Always returns SUCCESS.
template<>
uint32_t __handleSoftInterCeEte<TYPE_MBA>( ExtensibleChip * i_chip,
                                           TdQueue & io_queue,
                                           const MemAddr & i_addr,
                                           STEP_CODE_DATA_STRUCT & io_sc )
{
    // Workarounds on the Centaur prevent stopping on every soft or
    // intermittent CE the way it is done for Nimbus, so the threshold is set
    // much higher instead. Once that threshold is reached, all that is needed
    // is a rank callout and a TPS request.

    // Add the rank to the callout list.
    MemoryMru rankCallout { i_chip->getTrgt(), i_addr.getRank(),
                            MemoryMruData::CALLOUT_RANK };
    io_sc.service_data->SetCallout( rankCallout );

    // Trigger TPS on the rank; the raw pointer is handed to the queue.
    auto * tpsEvent = new TpsEvent<TYPE_MBA>( i_chip, i_addr.getRank() );
    io_queue.push( tpsEvent );

    return SUCCESS;
}

//------------------------------------------------------------------------------

template<TARGETING::TYPE T>
uint32_t __handleRceEte( ExtensibleChip * i_chip, TdQueue & io_queue,
const MemRank & i_rank, bool & o_errorsFound,
Expand Down Expand Up @@ -673,10 +709,10 @@ uint32_t __checkEcc( ExtensibleChip * i_chip, TdQueue & io_queue,
o_errorsFound = true;
io_sc.service_data->AddSignatureList( trgt, PRDFSIG_MaintINTER_CTE);

o_rc = __handleNceEte<T>( i_chip, io_queue, i_addr, io_sc );
o_rc = __handleSoftInterCeEte<T>( i_chip, io_queue, i_addr, io_sc );
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "__handleNceEte<T>(0x%08x) failed",
PRDF_ERR( PRDF_FUNC "__handleSoftInterCeEte<T>(0x%08x) failed",
huid );
break;
}
Expand All @@ -687,10 +723,10 @@ uint32_t __checkEcc( ExtensibleChip * i_chip, TdQueue & io_queue,
o_errorsFound = true;
io_sc.service_data->AddSignatureList( trgt, PRDFSIG_MaintSOFT_CTE );

o_rc = __handleNceEte<T>( i_chip, io_queue, i_addr, io_sc );
o_rc = __handleSoftInterCeEte<T>( i_chip, io_queue, i_addr, io_sc );
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "__handleNceEte<T>(0x%08x) failed",
PRDF_ERR( PRDF_FUNC "__handleSoftInterCeEte<T>(0x%08x) failed",
huid );
break;
}
Expand Down Expand Up @@ -781,14 +817,10 @@ template
uint32_t __checkEcc<TYPE_MCA>( ExtensibleChip * i_chip, TdQueue & io_queue,
const MemAddr & i_addr, bool & o_errorsFound,
STEP_CODE_DATA_STRUCT & io_sc );
template<>
template
uint32_t __checkEcc<TYPE_MBA>( ExtensibleChip * i_chip, TdQueue & io_queue,
const MemAddr & i_addr, bool & o_errorsFound,
STEP_CODE_DATA_STRUCT & io_sc )
{
// TODO: remove this once runtime support is enabled for MBA.
return SUCCESS;
}
STEP_CODE_DATA_STRUCT & io_sc );

//------------------------------------------------------------------------------

Expand Down

0 comments on commit e940af9

Please sign in to comment.