Skip to content

Commit

Permalink
PRD: refined handleChnlFail() for Centaur
Browse files Browse the repository at this point in the history
Change-Id: I698b0320a51cb9452a6b0e661c6f3c24a8f70b78
RTC: 136123
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/58607
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Reviewed-by: Benjamin J. Weisenbeck <bweisenb@us.ibm.com>
Reviewed-by: Zane C. Shelley <zshelle@us.ibm.com>
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/59762
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
  • Loading branch information
zane131 committed Jun 3, 2018
1 parent b14259e commit 9192da4
Show file tree
Hide file tree
Showing 12 changed files with 252 additions and 262 deletions.
27 changes: 24 additions & 3 deletions src/usr/diag/prdf/common/plat/cen/prdfCenMembuf_common.C
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,23 @@ int32_t Initialize( ExtensibleChip * i_chip )
}
PRDF_PLUGIN_DEFINE( cen_centaur, Initialize );

/**
 * @brief  Pre-analysis hook that runs ahead of the main analyze() function.
 * @param  i_chip     A MEMBUF chip.
 * @param  io_sc      The step code data struct.
 * @param  o_analyzed Always set to false by this plugin; it only prepares the
 *                    step code data and never completes analysis on its own.
 * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise.
 */
int32_t PreAnalysis( ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & io_sc,
                     bool & o_analyzed )
{
    // This hook does not analyze the chip itself.
    o_analyzed = false;

    // A channel failure must be detected before the chip is analyzed.
    const uint32_t l_rc = MemUtils::handleChnlFail<TYPE_MEMBUF>( i_chip,
                                                                 io_sc );
    return l_rc;
}
PRDF_PLUGIN_DEFINE( cen_centaur, PreAnalysis );

/**
* @brief Plugin function called after analysis is complete but before PRD
* exits.
Expand Down Expand Up @@ -108,11 +125,15 @@ int32_t analyzeDmiChnlFail( ExtensibleChip * i_mbChip,

do
{
// Query the connected DMI for channel fail attentions.
ExtensibleChip * dmiChip = getConnectedParent( i_mbChip, TYPE_DMI );

// TODO: RTC 136123 Need to call new interface that queries if there was
// a channel fail attention on the other side of the interface.
bool dmiChnlFail = false;
if ( SUCCESS != queryChnlFail<TYPE_DMI>(dmiChip, dmiChnlFail) )
{
PRDF_ERR( PRDF_FUNC "queryChnlFail(0x%08x) failed",
dmiChip->getHuid() );
break;
}

// If there is a channel fail attention on the other side of the bus,
// analyze the DMI target.
Expand Down
168 changes: 126 additions & 42 deletions src/usr/diag/prdf/common/plat/mem/prdfMemUtils.C
Original file line number Diff line number Diff line change
Expand Up @@ -506,69 +506,153 @@ void cleanupChnlAttns<TYPE_MEMBUF>( ExtensibleChip * i_chip,

//------------------------------------------------------------------------------

/* TODO RTC 136123
int32_t checkMcsChannelFail( ExtensibleChip * i_mcsChip,
STEP_CODE_DATA_STRUCT & io_sc )
/**
 * @brief  Queries the given chip for an active channel failure attention.
 * @param  i_chip     The chip to query. Its type must match T.
 * @param  o_chnlFail Set to true if a channel failure attention was found,
 *                    false otherwise.
 * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise.
 */
template<TARGETING::TYPE T>
uint32_t __queryChnlFail( ExtensibleChip * i_chip, bool & o_chnlFail );

/**
 * @brief  MEMBUF specialization: reports a channel failure when any bit is
 *         set in the Centaur GLOBAL_CS_FIR register.
 */
template<>
uint32_t __queryChnlFail<TYPE_MEMBUF>( ExtensibleChip * i_chip,
                                       bool & o_chnlFail )
{
    #define PRDF_FUNC "[MemUtils::__queryChnlFail] "

    PRDF_ASSERT( nullptr != i_chip );
    PRDF_ASSERT( TYPE_MEMBUF == i_chip->getType() );

    uint32_t o_rc = SUCCESS;

    // Default to "no channel failure" so the output is well defined even if
    // the register read below fails.
    o_chnlFail = false;

    do
    {
        // Simply check the Centaur CS global reg for active attentions.
        SCAN_COMM_REGISTER_CLASS * fir = i_chip->getRegister("GLOBAL_CS_FIR");
        o_rc = fir->Read();
        if ( SUCCESS != o_rc )
        {
            PRDF_ERR( PRDF_FUNC "Failed to read GLOBAL_CS_FIR on 0x%08x",
                      i_chip->getHuid() );
            break;
        }

        // Any bit set in the register is treated as a channel failure.
        o_chnlFail = !fir->BitStringIsZero();

    } while (0);

    return o_rc;

    #undef PRDF_FUNC
}

/**
 * @brief  DMI specialization: defers the query to the platform services layer.
 */
template<>
uint32_t __queryChnlFail<TYPE_DMI>( ExtensibleChip * i_chip, bool & o_chnlFail )
{
    // The processor side is checked by a HWP, which examines the CHIFIR,
    // IOMCFIR, and the associated configuration registers to decide whether a
    // valid channel failure attention is present.
    const uint32_t l_rc =
        PlatServices::queryChnlFail<TYPE_DMI>( i_chip, o_chnlFail );

    return l_rc;
}

//------------------------------------------------------------------------------

/**
 * @brief Flags that channel failure cleanup is required for the given chip.
 * @param i_chip The chip on which the channel failure was found. Its type
 *               must match T.
 */
template<TARGETING::TYPE T>
void __setChnlFailCleanup( ExtensibleChip * i_chip );

/**
 * @brief MEMBUF specialization: sets the cleanup flag in the chip's MEMBUF
 *        data bundle.
 */
template<>
void __setChnlFailCleanup<TYPE_MEMBUF>( ExtensibleChip * i_chip )
{
    PRDF_ASSERT( nullptr != i_chip );
    PRDF_ASSERT( TYPE_MEMBUF == i_chip->getType() );

    auto l_bundle = getMembufDataBundle( i_chip );
    l_bundle->iv_doChnlFailCleanup = true;
}

/**
 * @brief DMI specialization: forwards the cleanup request to the MEMBUF chip
 *        connected to this DMI.
 */
template<>
void __setChnlFailCleanup<TYPE_DMI>( ExtensibleChip * i_chip )
{
    PRDF_ASSERT( nullptr != i_chip );
    PRDF_ASSERT( TYPE_DMI == i_chip->getType() );

    // The cleanup flag lives on the MEMBUF side, so find the child at
    // position 0.
    ExtensibleChip * l_membuf = getConnectedChild( i_chip, TYPE_MEMBUF, 0 );
    PRDF_ASSERT( nullptr != l_membuf ); // shouldn't be possible

    __setChnlFailCleanup<TYPE_MEMBUF>( l_membuf );
}

//------------------------------------------------------------------------------

/**
 * @brief  Checks the given chip for a channel failure attention and, if one
 *         is present, updates the service data so the rest of analysis treats
 *         it as a UNIT_CS.
 * @param  i_chip The chip to check. Its type must match T.
 * @param  io_sc  The step code data struct.
 * @return Non-SUCCESS if the channel failure query fails, SUCCESS otherwise.
 */
template<TARGETING::TYPE T>
uint32_t handleChnlFail( ExtensibleChip * i_chip,
                         STEP_CODE_DATA_STRUCT & io_sc )
{
    PRDF_ASSERT( nullptr != i_chip );
    PRDF_ASSERT( T == i_chip->getType() );

    uint32_t o_rc = SUCCESS;

    do
    {
        // Skip if already handling channel failure.
        if ( io_sc.service_data->IsUnitCS() ) break;

        // Skip if currently analyzing a host attention. This is required for
        // a rare scenario when a channel failure occurs after PRD is called to
        // handle the host attention.
        if ( HOST_ATTN == io_sc.service_data->getPrimaryAttnType() ) break;

        // Look for the channel fail attention. The result must be assigned to
        // the function-level o_rc (not a new local) so that a query failure is
        // actually returned to the caller.
        bool isChnlFail = false;
        o_rc = __queryChnlFail<T>( i_chip, isChnlFail );
        if ( SUCCESS != o_rc ) break;

        if ( ! isChnlFail ) break; // No channel fail, nothing more to do.

        // Change the secondary attention type to UNIT_CS so the rule code will
        // start looking for UNIT_CS attentions instead of recoverable.
        io_sc.service_data->setSecondaryAttnType( UNIT_CS );

        // Set the UNIT_CS flag in the SDC to indicate a channel failure has
        // been detected and there is no need to check again.
        io_sc.service_data->setFlag( ServiceDataCollector::UNIT_CS );

        // Make the error log predictive and set threshold.
        io_sc.service_data->setFlag( ServiceDataCollector::SERVICE_CALL );
        io_sc.service_data->setFlag( ServiceDataCollector::AT_THRESHOLD );

        // Channel failures will always send SUEs.
        io_sc.service_data->setFlag( ServiceDataCollector::UERE );

        // Indicate cleanup is required on this channel.
        __setChnlFailCleanup<T>( i_chip );

    } while (0);

    // Return unconditionally: every path out of this function must yield a
    // value, including the SUCCESS path.
    return o_rc;
}

// Explicit instantiations of the generic handleChnlFail() template for the
// MEMBUF and DMI target types.
template
uint32_t handleChnlFail<TYPE_MEMBUF>( ExtensibleChip * i_chip,
STEP_CODE_DATA_STRUCT & io_sc );
template
uint32_t handleChnlFail<TYPE_DMI>( ExtensibleChip * i_chip,
STEP_CODE_DATA_STRUCT & io_sc );

/**
 * @brief  MC specialization: runs the DMI channel failure handling on each DMI
 *         connected to the given MC, stopping at the first failure.
 * @param  i_chip An MC chip.
 * @param  io_sc  The step code data struct.
 * @return The first non-SUCCESS code returned for a connected DMI, SUCCESS
 *         otherwise.
 */
template<>
uint32_t handleChnlFail<TYPE_MC>( ExtensibleChip * i_chip,
                                  STEP_CODE_DATA_STRUCT & io_sc )
{
    PRDF_ASSERT( nullptr != i_chip );
    PRDF_ASSERT( TYPE_MC == i_chip->getType() );

    uint32_t o_rc = SUCCESS;

    for ( auto & dmiChip : getConnected(i_chip, TYPE_DMI) )
    {
        o_rc = handleChnlFail<TYPE_DMI>( dmiChip, io_sc );
        if ( SUCCESS != o_rc ) break;
    }

    return o_rc;
}
*/

//------------------------------------------------------------------------------

Expand Down
28 changes: 19 additions & 9 deletions src/usr/diag/prdf/common/plat/mem/prdfMemUtils.H
Original file line number Diff line number Diff line change
Expand Up @@ -109,15 +109,6 @@ int32_t collectCeStats( ExtensibleChip * i_chip, const MemRank & i_rank,
template<TARGETING::TYPE T>
uint8_t getDramSize( ExtensibleChip * i_chip, uint8_t i_dimmSlct = 0 );

/**
* @brief Check for channel fail attentions on the MCS side of the DMI bus.
* @param i_mcsChip An MCS chip.
* @param io_sc The step code data struct.
* @return Non-SUCCESS if an internal function fails, SUCCESS otherwise.
*/
int32_t checkMcsChannelFail( ExtensibleChip * i_mcsChip,
STEP_CODE_DATA_STRUCT & io_sc );

/**
* @brief determines the type of Centaur based raw card associated with MBA.
* @param i_mba mba target
Expand All @@ -140,6 +131,25 @@ int32_t getRawCardType( TARGETING::TargetHandle_t i_mba,
template<TARGETING::TYPE T>
void cleanupChnlAttns( ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & io_sc );

/**
* @brief Checks for channel fail attentions on the target side of the bus.
* @note If a channel fail attention is present, this function will set the
* secondary attention type to UNIT_CS, among other appropriate flags
* in the SDC. It is important that this is called in the PreAnalysis
* plugin so that PRD knows to look for UNIT_CS attentions instead of
* recoverable attentions.
* @note This only checks one side of the bus. It does not check both sides.
* @note If the primary attention type is HOST_ATTN, the function exits and
* waits for the next attention. This is for the rare scenario where a
* channel failure occurs after PRD is called to handle a HOST_ATTN.
* @tparam T The target type of i_chip.
* @param i_chip MEMBUF, DMI, or MC chip.
* @param io_sc The step code data struct.
* @return Non-SUCCESS if an internal function fails, SUCCESS otherwise.
*/
template<TARGETING::TYPE T>
uint32_t handleChnlFail( ExtensibleChip * i_chip,
STEP_CODE_DATA_STRUCT & io_sc );

/**
* @brief After analyzing a memory channel failure, we want to mask off all
* possible attentions on that channel to ensure we don't get any errant
Expand Down
17 changes: 17 additions & 0 deletions src/usr/diag/prdf/common/plat/p9/prdfP9Dmi_common.C
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,23 @@ namespace p9_dmi
//
//##############################################################################

/**
 * @brief  Pre-analysis hook that runs ahead of the main analyze() function.
 * @param  i_chip     A DMI chip.
 * @param  io_sc      The step code data struct.
 * @param  o_analyzed Always set to false by this plugin; it only prepares the
 *                    step code data and never completes analysis on its own.
 * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise.
 */
int32_t PreAnalysis( ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & io_sc,
                     bool & o_analyzed )
{
    // This hook does not analyze the chip itself.
    o_analyzed = false;

    // A channel failure must be detected before the chip is analyzed.
    const uint32_t l_rc = MemUtils::handleChnlFail<TYPE_DMI>( i_chip, io_sc );
    return l_rc;
}
PRDF_PLUGIN_DEFINE( p9_dmi, PreAnalysis );

/**
* @brief Plugin function called after analysis is complete but before PRD
* exits.
Expand Down
17 changes: 17 additions & 0 deletions src/usr/diag/prdf/common/plat/p9/prdfP9Mc_common.C
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,23 @@ namespace p9_mc
//
//##############################################################################

/**
 * @brief  Pre-analysis hook that runs ahead of the main analyze() function.
 * @param  i_chip     An MC chip.
 * @param  io_sc      The step code data struct.
 * @param  o_analyzed Always set to false by this plugin; it only prepares the
 *                    step code data and never completes analysis on its own.
 * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise.
 */
int32_t PreAnalysis( ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & io_sc,
                     bool & o_analyzed )
{
    // This hook does not analyze the chip itself.
    o_analyzed = false;

    // A channel failure must be detected before the chip is analyzed.
    const uint32_t l_rc = MemUtils::handleChnlFail<TYPE_MC>( i_chip, io_sc );
    return l_rc;
}
PRDF_PLUGIN_DEFINE( p9_mc, PreAnalysis );

/**
* @brief Plugin function called after analysis is complete but before PRD
* exits.
Expand Down

0 comments on commit 9192da4

Please sign in to comment.