Skip to content

Commit

Permalink
PRD: Prioritize centaur internal timeout over channel failure
Browse files Browse the repository at this point in the history
Change-Id: Idc9b57c28d48a9426bb8aee7ae4d17dac285e537
CQ: SW451358
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/70277
Reviewed-by: Caleb N. Palmer <cnpalmer@us.ibm.com>
Reviewed-by: Brian J. Stegmiller <bjs@us.ibm.com>
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Reviewed-by: Zane C. Shelley <zshelle@us.ibm.com>
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/70569
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
  • Loading branch information
zane131 committed Jan 18, 2019
1 parent 9fc690c commit 7ef75d2
Show file tree
Hide file tree
Showing 5 changed files with 116 additions and 63 deletions.
3 changes: 2 additions & 1 deletion src/usr/diag/prdf/common/plat/centaur/centaur_membuf.rule
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#
# OpenPOWER HostBoot Project
#
# Contributors Listed Below - COPYRIGHT 2016,2018
# Contributors Listed Below - COPYRIGHT 2016,2019
# [+] International Business Machines Corp.
#
#
Expand Down Expand Up @@ -33,6 +33,7 @@ chip centaur_membuf

# Import signatures
.include "prdfP9ProcMbCommonExtraSig.H";
.include "prdfCenMembufExtraSig.H";
.include "prdfLaneRepairExtraSig.H";

#############################################################################
Expand Down
38 changes: 38 additions & 0 deletions src/usr/diag/prdf/common/plat/centaur/prdfCenMembufExtraSig.H
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
/* IBM_PROLOG_BEGIN_TAG */
/* This is an automatically generated prolog. */
/* */
/* $Source: src/usr/diag/prdf/common/plat/centaur/prdfCenMembufExtraSig.H $ */
/* */
/* OpenPOWER HostBoot Project */
/* */
/* Contributors Listed Below - COPYRIGHT 2013,2019 */
/* [+] International Business Machines Corp. */
/* */
/* */
/* Licensed under the Apache License, Version 2.0 (the "License"); */
/* you may not use this file except in compliance with the License. */
/* You may obtain a copy of the License at */
/* */
/* http://www.apache.org/licenses/LICENSE-2.0 */
/* */
/* Unless required by applicable law or agreed to in writing, software */
/* distributed under the License is distributed on an "AS IS" BASIS, */
/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or */
/* implied. See the License for the specific language governing */
/* permissions and limitations under the License. */
/* */
/* IBM_PROLOG_END_TAG */

#ifndef __prdfCenMembufExtraSig_H
#define __prdfCenMembufExtraSig_H

#include <prdrSignatures.H>
#include <prdfMemExtraSig.H>

// Extra signature "InternalTimeout" (ID 0xbbbb0000). The description strings
// tie it to MBSFIR[4], the internal timeout attention.
PRDR_ERROR_SIGNATURE( InternalTimeout, 0xbbbb0000, "(MBSFIR[4])",
"INTERNAL_TIMEOUT" );

#endif // __prdfCenMembufExtraSig_H



Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#
# OpenPOWER HostBoot Project
#
# Contributors Listed Below - COPYRIGHT 2017,2018
# Contributors Listed Below - COPYRIGHT 2017,2019
# [+] International Business Machines Corp.
#
#
Expand Down Expand Up @@ -49,8 +49,7 @@ actionclass dmi_bus_th_1_UERE
# Action class dsffChannelTimeout_UERE. As listed here it applies SueSource,
# threshold1, a funccall to the "dsffChannelTimeoutCheck" plugin, and self_th_1.
# NOTE(review): threshold1/funccall and self_th_1 appear together in this view;
# confirm which set of actions is actually intended to be active.
actionclass dsffChannelTimeout_UERE
{
SueSource;
threshold1;
funccall("dsffChannelTimeoutCheck");
self_th_1;
};

################################################################################
Expand Down
76 changes: 74 additions & 2 deletions src/usr/diag/prdf/common/plat/mem/prdfMemUtils.C
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
/* */
/* OpenPOWER HostBoot Project */
/* */
/* Contributors Listed Below - COPYRIGHT 2013,2018 */
/* Contributors Listed Below - COPYRIGHT 2013,2019 */
/* [+] International Business Machines Corp. */
/* */
/* */
Expand Down Expand Up @@ -37,6 +37,7 @@
// Platform includes
#include <prdfCenMbaDataBundle.H>
#include <prdfCenMembufDataBundle.H>
#include <prdfCenMembufExtraSig.H>
#include <prdfMemSymbol.H>
#include <prdfParserUtils.H>
#include <prdfPlatServices.H>
Expand Down Expand Up @@ -917,6 +918,69 @@ bool __analyzeRcdParityError<TYPE_MEMBUF>( ExtensibleChip * i_chip,

//------------------------------------------------------------------------------

// Channel failure analysis is designed to only look for UNIT_CS attentions and
// not associate any recoverables as the root cause. Of course, now we have yet
// another special case. An internal timeout is a recoverable attention that
// could cause unit CS attentions as a side effect. Therefore, we must analyze
// it first before looking for any UNIT_CS attentions.

/**
 * @brief  Checks a MEMBUF chip for a legitimate internal timeout attention
 *         (MBSFIR[4] set and unmasked, with MBSFIR[3] not set). If found, the
 *         chip is called out and the signature is set to
 *         PRDFSIG_InternalTimeout.
 * @param  i_chip A MEMBUF chip (asserted to be non-null and TYPE_MEMBUF).
 * @param  io_sc  The step code data struct. The callout and signature are
 *                added to its service data when the attention is found.
 * @return True if an internal timeout was found and handled. False otherwise,
 *         including when MBSFIR or MBSFIR_MASK could not be read.
 */
template<TARGETING::TYPE T>
bool __analyzeInternalTimeout( ExtensibleChip * i_chip,
                               STEP_CODE_DATA_STRUCT & io_sc );

template<>
bool __analyzeInternalTimeout<TYPE_MEMBUF>( ExtensibleChip * i_chip,
                                            STEP_CODE_DATA_STRUCT & io_sc )
{
    #define PRDF_FUNC "[MemUtils::__analyzeInternalTimeout] "

    PRDF_ASSERT( nullptr != i_chip );
    PRDF_ASSERT( TYPE_MEMBUF == i_chip->getType() );

    // The function returns bool, so the result is tracked as a bool rather
    // than an integer that is implicitly converted on return.
    bool o_analyzed = false;

    SCAN_COMM_REGISTER_CLASS * fir  = i_chip->getRegister( "MBSFIR" );
    SCAN_COMM_REGISTER_CLASS * mask = i_chip->getRegister( "MBSFIR_MASK" );

    do
    {
        // Both registers are read (bitwise OR, no short-circuit) and any
        // non-SUCCESS return code aborts the check.
        if ( SUCCESS != (fir->Read() | mask->Read()) )
        {
            PRDF_ERR( PRDF_FUNC "Failed to read MBSFIRs on 0x%08x",
                      i_chip->getHuid() );
            break;
        }

        // If there is an internal timeout that is not masked and there is not
        // an external timeout (note external timeout is always masked), then
        // there is a legit internal timeout attention.
        if ( fir->IsBitSet(4) && !mask->IsBitSet(4) && !fir->IsBitSet(3) )
        {
            // We are not going to analyze the MEMBUF chip like we do with some
            // of the other helper functions in this file because the rule code
            // priority will put the MBSFIR after the MBIFIR and DMIFIR.
            // Therefore, there is no way to guarantee this attention will be
            // analyzed. Since we do know there is a channel failure we can
            // simply make a predictive callout because the channel failure code
            // will eventually mask the entire Centaur.

            io_sc.service_data->SetCallout( i_chip->getTrgt() );

            io_sc.service_data->setSignature( i_chip->getHuid(),
                                              PRDFSIG_InternalTimeout );

            o_analyzed = true; break; // analysis complete
        }

    } while (0);

    return o_analyzed;

    #undef PRDF_FUNC
}

//------------------------------------------------------------------------------

// Handling channel failures from more than one channel at a time:
// Say we were called to handle a recoverable attention on a Centaur, but the
// channel containing that Centaur has a unit checkstop attention in the
Expand Down Expand Up @@ -965,12 +1029,20 @@ bool __analyzeChnlFail<TYPE_MC>( ExtensibleChip * i_chip,
}

// First, check for RCD parity errors. They are recoverable attentions
// that could has a channel failure attention as a side effect.
// that could have a channel failure attention as a side effect.
if ( __analyzeRcdParityError<TYPE_MEMBUF>(membChip, io_sc) )
{
o_analyzed = true; break; // analysis complete
}

// Now, check for an internal timeout error. This is a recoverable
// attention that could have a channel failure attention as a side
// effect.
if ( __analyzeInternalTimeout<TYPE_MEMBUF>(membChip, io_sc) )
{
o_analyzed = true; break; // analysis complete
}

// Now, look for unit checkstops in the CHIFIR, excluding
// CHIFIR[16,19:21,61].
if ( __queryUcsChifir<TYPE_DMI>(dmiChip) )
Expand Down
57 changes: 0 additions & 57 deletions src/usr/diag/prdf/common/plat/p9/prdfP9Dmi_common.C
Original file line number Diff line number Diff line change
Expand Up @@ -88,63 +88,6 @@ int32_t PostAnalysis( ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & io_sc )
}
PRDF_PLUGIN_DEFINE( cumulus_dmi, PostAnalysis );

//##############################################################################
//
// CHIFIR
//
//##############################################################################
/**
 * @brief  Decides whether a CHIFIR[61] channel timeout is legitimate or a side
 *         effect of a MBSFIR[4] internal timeout, and adds the matching
 *         callout.
 * @param  i_dmiChip DMI chip.
 * @param  io_sc     Step code data struct.
 * @return SUCCESS once MBSFIR has been read and a callout has been added
 *         (MEMBUF if MBSFIR[4] is set and MBSFIR[3] is not, DMI otherwise).
 *         If the MBSFIR read fails, the return code from that read is
 *         returned and no callout is added.
 */
int32_t dsffChannelTimeoutCheck( ExtensibleChip * i_dmiChip,
                                 STEP_CODE_DATA_STRUCT & io_sc )
{
    #define PRDF_FUNC "[dsffChannelTimeoutCheck] "

    ExtensibleChip * centaur = getConnectedChild( i_dmiChip, TYPE_MEMBUF, 0 );
    PRDF_ASSERT( nullptr != centaur );

    // Get MBSFIR from the connected MEMBUF.
    SCAN_COMM_REGISTER_CLASS * fir = centaur->getRegister("MBSFIR");

    int32_t o_rc = fir->Read();
    if ( SUCCESS != o_rc )
    {
        PRDF_ERR( PRDF_FUNC "MBSFIR read failed for 0x%08x",
                  centaur->getHuid() );
        return o_rc;
    }

    // MBSFIR[4] without MBSFIR[3]: internal timeout, predictive centaur
    // callout. Anything else: CHIFIR[61] channel timeout, predictive DMI
    // callout.
    const bool internalTimeout = fir->IsBitSet(4) && !fir->IsBitSet(3);

    ExtensibleChip * calloutChip = internalTimeout ? centaur : i_dmiChip;
    io_sc.service_data->SetCallout( calloutChip->getTrgt() );

    return o_rc;

    #undef PRDF_FUNC
}
PRDF_PLUGIN_DEFINE( cumulus_dmi, dsffChannelTimeoutCheck );

//------------------------------------------------------------------------------

} // end namespace cumulus_dmi
Expand Down

0 comments on commit 7ef75d2

Please sign in to comment.