Skip to content

Commit

Permalink
PRD: reorder isolation for channel fail attentions
Browse files Browse the repository at this point in the history
Before analyzing the MBIFIR, we now check for UNIT_CS attentions on the
CHIFIR. If any exist, the CHIFIR is analyzed first.

Change-Id: I1d4f38a75c1f26dd13d76e894b96bdd60c520774
CQ: SW438676
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/62865
Reviewed-by: Benjamin J. Weisenbeck <bweisenb@us.ibm.com>
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Reviewed-by: Zane C. Shelley <zshelle@us.ibm.com>
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/63090
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
  • Loading branch information
zane131 committed Jul 21, 2018
1 parent 5268e2f commit a908d83
Show file tree
Hide file tree
Showing 4 changed files with 118 additions and 57 deletions.
11 changes: 2 additions & 9 deletions src/usr/diag/prdf/common/plat/cen/cen_centaur_actions.rule
Original file line number Diff line number Diff line change
Expand Up @@ -125,20 +125,13 @@ actionclass tooManyBusErrors_dmibus_UERE
SueSource; # channel failure
};

/** Calls out the DMI bus (TH 1), then clears any secondary attentions by
 *  calling the ClearMbsSecondaryBits plugin. */
actionclass replay_timeout_UERE
{
    calloutBusInterface_dmibus_th1_UERE;
    funccall("ClearMbsSecondaryBits");
};

/** Handles RCD parity errors, if present. Otherwise, calls out self (TH 1). */
actionclass mbs_internal_timeout
{
Expand Down
2 changes: 2 additions & 0 deletions src/usr/diag/prdf/common/plat/cen/cen_mba.rule
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,8 @@ rule rMBA
summary( 4, rMBASPA );
};

# The MBACALFIR must be analyzed first so that the RCD parity errors are the
# first to be analyzed.
group gMBA attntype UNIT_CS, RECOVERABLE, HOST_ATTN filter singlebit
{
(rMBA, bit(0)) ? analyze(gMBACALFIR);
Expand Down
149 changes: 102 additions & 47 deletions src/usr/diag/prdf/common/plat/cen/prdfCenMembuf_common.C
Original file line number Diff line number Diff line change
Expand Up @@ -73,10 +73,110 @@ PRDF_PLUGIN_DEFINE( cen_centaur, Initialize );
int32_t PreAnalysis( ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & io_sc,
                     bool & o_analyzed )
{
    #define PRDF_FUNC "[cen_centaur::PreAnalysis] "

    // Overview of what this function does, in order:
    //   1. For RECOVERABLE attentions, look for RCD parity errors
    //      (MBACALFIR[4] or MBACALFIR[7], unmasked) on each connected MBA and
    //      analyze that MBA if one is found.
    //   2. Call handleChnlFail() for this Centaur.
    //   3. If a memory channel failure is flagged in the service data, look
    //      for active UNIT_CS attentions in the CHIFIR of the parent DMI
    //      (ignoring CHIFIR[16:21]) and analyze the DMI if any are found.
    //
    // o_analyzed is set to true only when one of the Analyze() calls above
    // returned SUCCESS. This function itself always returns SUCCESS; failures
    // along the way are traced and otherwise tolerated.

    o_analyzed = false;

    SCAN_COMM_REGISTER_CLASS * fir  = nullptr;
    SCAN_COMM_REGISTER_CLASS * mask = nullptr;
    SCAN_COMM_REGISTER_CLASS * act0 = nullptr;
    SCAN_COMM_REGISTER_CLASS * act1 = nullptr;

    ATTENTION_TYPE secAttnType = io_sc.service_data->getSecondaryAttnType();

    do
    {
        // There is nothing to do for HOST_ATTNs.
        if ( HOST_ATTN == secAttnType ) break;

        // Channel failure analysis is designed to only look for UNIT_CS
        // attentions and not associate any recoverables as the root cause. Of
        // course, now we have a special case. RCD parity errors are recoverable
        // attentions that could cause unit CS attentions as a side effect.
        // Therefore, we must analyze them first before doing any channel
        // failure checking.
        if ( RECOVERABLE == secAttnType )
        {
            for ( auto & mbaChip : getConnected(i_chip, TYPE_MBA) )
            {
                fir  = mbaChip->getRegister( "MBACALFIR" );
                mask = mbaChip->getRegister( "MBACALFIR_MASK" );

                // Bitwise OR (not ||) so that both registers are always read
                // and any non-SUCCESS return code is detected.
                if ( SUCCESS != (fir->Read() | mask->Read()) )
                {
                    PRDF_ERR( PRDF_FUNC "Failed to read MBACALFIRs on 0x%08x",
                              mbaChip->getHuid() );
                    continue; // try the other MBA
                }

                if ( (fir->IsBitSet(4) && !mask->IsBitSet(4)) ||
                     (fir->IsBitSet(7) && !mask->IsBitSet(7)) )
                {
                    PRDF_INF( PRDF_FUNC "RCD parity error found on 0x%08x",
                              mbaChip->getHuid() );

                    if ( SUCCESS == mbaChip->Analyze(io_sc, secAttnType) )
                    {
                        o_analyzed = true;
                        break; // analysis complete (leaves the MBA loop only)
                    }
                }
            }
            if ( o_analyzed ) break; // nothing more to do
        }

        // Now, check for the presence of channel failures on the Centaur.
        // A failure here is traced but does not stop the checks below.
        if ( SUCCESS != MemUtils::handleChnlFail<TYPE_MEMBUF>(i_chip, io_sc) )
        {
            PRDF_ERR( PRDF_FUNC "handleChnlFail(0x%08x) failed",
                      i_chip->getHuid() );
        }

        // If there is a channel failure on the Centaur, it is possible that it
        // may be a side-effect of a channel failure attention from the CHIFIR
        // on the other side of the bus. Therefore, we must check for any active
        // UNIT_CS attentions from the CHIFIR. If so, analyze the DMI target.
        if ( io_sc.service_data->isMemChnlFail() )
        {
            ExtensibleChip * dmiChip = getConnectedParent( i_chip, TYPE_DMI );

            fir  = dmiChip->getRegister( "CHIFIR" );
            mask = dmiChip->getRegister( "CHIFIR_MASK" );
            act0 = dmiChip->getRegister( "CHIFIR_ACT0" );
            act1 = dmiChip->getRegister( "CHIFIR_ACT1" );

            // Bitwise OR so that all four registers are read.
            if ( SUCCESS != (fir->Read()  | mask->Read() |
                             act0->Read() | act1->Read()) )
            {
                PRDF_ERR( PRDF_FUNC "Failed to read CHIFIRs on 0x%08x",
                          dmiChip->getHuid() );
                break;
            }

            // Make sure to ignore CHIFIR[16:21], which simply say there is an
            // attention on the Centaur. Otherwise, we will get stuck in a loop.
            // The constant below clears exactly those six bits (bit 0 being
            // the most significant bit of the 64-bit value).
            if ( 0 != (  fir->GetBitFieldJustified( 0,64) &
                        ~mask->GetBitFieldJustified(0,64) &
                         act0->GetBitFieldJustified(0,64) &
                         act1->GetBitFieldJustified(0,64) &
                         0xffff03ffffffffffull ) )
            {
                PRDF_INF( PRDF_FUNC "CHIFIR UNIT_CS attns present on 0x%08x",
                          dmiChip->getHuid() );

                if ( SUCCESS == dmiChip->Analyze(io_sc, secAttnType) )
                {
                    o_analyzed = true;
                    break; // analysis complete
                }
            }
        }

    } while (0);

    return SUCCESS;

    #undef PRDF_FUNC
}
PRDF_PLUGIN_DEFINE( cen_centaur, PreAnalysis );

Expand Down Expand Up @@ -163,51 +263,6 @@ int32_t CheckForUnitCs( ExtensibleChip * i_chip, bool & o_hasAttns )
//
//##############################################################################

/**
 * @brief  Looks for an active channel fail attention on the DMI side of the
 *         bus and, when one is present, runs analysis on the connected DMI
 *         target.
 * @param  i_mbChip MEMBUF chip.
 * @param  io_sc    Step code data struct
 * @return The return code from the DMI's Analyze() when a channel fail
 *         attention was found. PRD_SCAN_COMM_REGISTER_ZERO when the query
 *         failed or no such attention was present.
 */
int32_t analyzeDmiChnlFail( ExtensibleChip * i_mbChip,
                            STEP_CODE_DATA_STRUCT & io_sc )
{
    #define PRDF_FUNC "[analyzeDmiChnlFail] "

    // The DMI is the parent of this MEMBUF on the other side of the bus.
    ExtensibleChip * parentDmi = getConnectedParent( i_mbChip, TYPE_DMI );

    // Ask the DMI whether it currently has a channel fail attention.
    bool chnlFailActive = false;
    if ( SUCCESS != queryChnlFail<TYPE_DMI>(parentDmi, chnlFailActive) )
    {
        PRDF_ERR( PRDF_FUNC "queryChnlFail(0x%08x) failed",
                  parentDmi->getHuid() );
        return PRD_SCAN_COMM_REGISTER_ZERO; // nothing found
    }

    // No attention on the DMI side, so there is nothing to analyze there.
    if ( !chnlFailActive )
    {
        return PRD_SCAN_COMM_REGISTER_ZERO; // nothing found
    }

    // Hand analysis over to the DMI target using the secondary attention type.
    return parentDmi->Analyze( io_sc,
                               io_sc.service_data->getSecondaryAttnType() );

    #undef PRDF_FUNC
}
PRDF_PLUGIN_DEFINE( cen_centaur, analyzeDmiChnlFail );

//------------------------------------------------------------------------------

/**
* @brief Calls analyze() on the target MBA if there is an active RCD parity
* error.
Expand Down
13 changes: 12 additions & 1 deletion src/usr/diag/prdf/common/plat/p9/p9_dmi.rule
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,18 @@ rule rCHIFIR
CHIFIR & ~CHIFIR_MASK & CHIFIR_ACT0 & CHIFIR_ACT1;
};

group gCHIFIR filter singlebit, cs_root_cause( 0, 2, 4, 5, 6, 12, 14, 15, 16, 32, 36, 40, 41, 42, 43, 46, 61 )
# Note that CHIFIR[16:21] indicate there was an attention on the Centaur. All
# bits in this FIR must be prioritized over CHIFIR[16:21]. Otherwise, we may
# get stuck in a loop under some conditions.
group gCHIFIR filter priority( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
10,11,12,13,14,15,
22,23,24,25,26,27,28,29,
30,31,32,33,34,35,36,37,38,39,
40,41,42,43,44,45,46,47,48,49,
50,51,52,53,54,55,56,57,58,59,
60,61,62,63),
cs_root_cause( 0, 2, 4, 5, 6, 12, 14, 15, 16, 32, 36, 40,
41, 42, 43, 46, 61 )
{
/** CHIFIR[0]
* PE on internal register
Expand Down

0 comments on commit a908d83

Please sign in to comment.