Skip to content

Commit

Permalink
PRD: better isolation for RCD parity errors and channel failures
Browse files Browse the repository at this point in the history
Change-Id: Ied9cc95e8eca42cdd7919ffa0a35a9d2e2a4d708
CQ: SW443190
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/65684
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Reviewed-by: Benjamin J. Weisenbeck <bweisenb@us.ibm.com>
Reviewed-by: Brian J. Stegmiller <bjs@us.ibm.com>
Reviewed-by: Caleb N. Palmer <cnpalmer@us.ibm.com>
Reviewed-by: Zane C. Shelley <zshelle@us.ibm.com>
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/65904
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
  • Loading branch information
zane131 committed Sep 11, 2018
1 parent 09370fc commit 60d11f6
Show file tree
Hide file tree
Showing 6 changed files with 349 additions and 295 deletions.
104 changes: 2 additions & 102 deletions src/usr/diag/prdf/common/plat/cen/prdfCenMembuf_common.C
Original file line number Diff line number Diff line change
Expand Up @@ -73,110 +73,10 @@ PRDF_PLUGIN_DEFINE( cen_centaur, Initialize );
/**
 * @brief  Pre-analysis plugin for the chip: looks for channel failures (and,
 *         in the longer path below, RCD parity errors) before the chip's own
 *         analysis runs.
 *
 * NOTE(review): this text appears to come from a commit diff view whose +/-
 * markers were lost (the page header reports 2 additions and 102 deletions
 * for this file). The do/while body and the single analyzeChnlFail() call
 * near the end were therefore probably never live in the same revision --
 * confirm against the repository before relying on the combined flow.
 *
 * @param  i_chip     The chip under analysis; passed to the TYPE_MEMBUF
 *                    channel failure helpers and used to find connected MBA
 *                    and DMI chips.
 * @param  io_sc      The step code data struct; supplies the secondary
 *                    attention type and is forwarded to every Analyze() call.
 * @param  o_analyzed Output flag. As written, its final value is whatever
 *                    MemUtils::analyzeChnlFail<TYPE_MEMBUF>() returns.
 * @return Always SUCCESS; failures along the way are only traced.
 */
int32_t PreAnalysis( ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & io_sc,
bool & o_analyzed )
{
// Function-name prefix used by the trace statements below; undefined again
// at the end of the function.
#define PRDF_FUNC "[cen_centaur::PreAnalysis] "

o_analyzed = false;

// Register handles are reused: first for each MBA's MBACALFIR pair, then for
// the DMI's CHIFIR set.
SCAN_COMM_REGISTER_CLASS * fir = nullptr;
SCAN_COMM_REGISTER_CLASS * mask = nullptr;
SCAN_COMM_REGISTER_CLASS * act0 = nullptr;
SCAN_COMM_REGISTER_CLASS * act1 = nullptr;

ATTENTION_TYPE secAttnType = io_sc.service_data->getSecondaryAttnType();

// Single-pass do/while(0) so that 'break' can be used as an early exit.
do
{
// There is nothing to do for HOST_ATTNs.
if ( HOST_ATTN == secAttnType ) break;

// Channel failure analysis is designed to look only for UNIT_CS attentions
// and does not treat recoverable attentions as the root cause. RCD parity
// errors are the special case: they are recoverable attentions that can
// cause UNIT_CS attentions as a side effect. Therefore, they must be
// analyzed first, before any channel failure checking.
if ( RECOVERABLE == secAttnType )
{
for ( auto & mbaChip : getConnected(i_chip, TYPE_MBA) )
{
fir = mbaChip->getRegister( "MBACALFIR" );
mask = mbaChip->getRegister( "MBACALFIR_MASK" );

// Bitwise OR (not ||) so both reads are always attempted.
if ( SUCCESS != (fir->Read() | mask->Read()) )
{
PRDF_ERR( PRDF_FUNC "Failed to read MBACALFIRs on 0x%08x",
mbaChip->getHuid() );
continue; // try the other MBA
}

// MBACALFIR bits 4 and 7 are the ones treated as RCD parity errors here;
// a bit only counts when it is set in the FIR and not set in the mask.
if ( (fir->IsBitSet(4) && !mask->IsBitSet(4)) ||
(fir->IsBitSet(7) && !mask->IsBitSet(7)) )
{
PRDF_INF( PRDF_FUNC "RCD parity error found on 0x%08x",
mbaChip->getHuid() );

// If the MBA analysis fails, keep looking at the remaining MBAs.
if ( SUCCESS == mbaChip->Analyze(io_sc, secAttnType) )
{
o_analyzed = true;
break; // analysis complete
}
}
}
if ( o_analyzed ) break; // nothing more to do
}

// Now, check for the presence of channel failures on the Centaur. A failure
// of the helper is traced but does not stop the rest of this function.
if ( SUCCESS != MemUtils::handleChnlFail<TYPE_MEMBUF>(i_chip, io_sc) )
{
PRDF_ERR( PRDF_FUNC "handleChnlFail(0x%08x) failed",
i_chip->getHuid() );
}

// If there is a channel failure on the Centaur, it may be a side effect of a
// channel failure attention from the CHIFIR on the other side of the bus.
// Therefore, check for any active UNIT_CS attentions from the CHIFIR and, if
// there are any, analyze the DMI target.
if ( io_sc.service_data->isMemChnlFail() )
{
// NOTE(review): dmiChip is dereferenced without a null check -- confirm
// that getConnectedParent() cannot return nullptr here.
ExtensibleChip * dmiChip = getConnectedParent( i_chip, TYPE_DMI );

fir = dmiChip->getRegister( "CHIFIR" );
mask = dmiChip->getRegister( "CHIFIR_MASK" );
act0 = dmiChip->getRegister( "CHIFIR_ACT0" );
act1 = dmiChip->getRegister( "CHIFIR_ACT1" );

// Bitwise OR (not ||) so all four reads are always attempted.
if ( SUCCESS != (fir->Read() | mask->Read() |
act0->Read() | act1->Read()) )
{
PRDF_ERR( PRDF_FUNC "Failed to read CHIFIRs on 0x%08x",
dmiChip->getHuid() );
break;
}

// Make sure to ignore CHIFIR[16:21], which simply say there is an
// attention on the Centaur. Otherwise, we will get stuck in a loop.
// A bit counts only if it is set in the FIR, not masked, and set in both
// ACT0 and ACT1. The trailing constant drops bits 16:21 (presumably
// MSB-0 bit numbering, matching the comment above -- TODO confirm).
if ( 0 != ( fir->GetBitFieldJustified( 0,64) &
~mask->GetBitFieldJustified(0,64) &
act0->GetBitFieldJustified(0,64) &
act1->GetBitFieldJustified(0,64) &
0xffff03ffffffffffull ) )
{
PRDF_INF( PRDF_FUNC "CHIFIR UNIT_CS attns present on 0x%08x",
dmiChip->getHuid() );

if ( SUCCESS == dmiChip->Analyze(io_sc, secAttnType) )
{
o_analyzed = true;
break; // analysis complete
}
}
}

} while (0);
// Check for a channel failure before analyzing this chip.
// NOTE(review): this assignment is unconditional, so it overwrites any value
// o_analyzed was given inside the do/while block above.
o_analyzed = MemUtils::analyzeChnlFail<TYPE_MEMBUF>( i_chip, io_sc );

return SUCCESS;

#undef PRDF_FUNC
}
PRDF_PLUGIN_DEFINE( cen_centaur, PreAnalysis );

Expand Down

0 comments on commit 60d11f6

Please sign in to comment.