Skip to content

Commit

Permalink
Updates error paths for PRD FIR checking
Browse files Browse the repository at this point in the history
FIRs can cause errors within hardware procedures. PRD has
the capability to retrigger a procedure if it sees an error.
We might be able to avoid IPL issues this way, so if a FIR
has been hit during hardware-enabled code (CCS or calibration),
we log the error and let PRD find the "new" FIR that could have
caused the hardware engine to have an issue. If there is some other
problem, the retriggered HWP will find it.

Change-Id: I81599d1d0c4b4c256b79820b4a7e2eafc09e206b
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/46571
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Reviewed-by: JACOB L. HARVEY <jlharvey@us.ibm.com>
Reviewed-by: Louis Stermole <stermole@us.ibm.com>
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Tested-by: HWSV CI <hwsv-ci+hostboot@us.ibm.com>
Reviewed-by: ANDRE A. MARIN <aamarin@us.ibm.com>
Tested-by: Hostboot CI <hostboot-ci+hostboot@us.ibm.com>
Reviewed-by: Jennifer A. Stofer <stofer@us.ibm.com>
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/46584
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
Reviewed-by: Daniel M. Crowell <dcrowell@us.ibm.com>
  • Loading branch information
sglancy6 authored and dcrowell77 committed Oct 3, 2017
1 parent f21a18e commit 3890040
Show file tree
Hide file tree
Showing 13 changed files with 432 additions and 26 deletions.
7 changes: 6 additions & 1 deletion src/import/chips/p9/procedures/hwp/memory/lib/ccs/ccs.C
Expand Up @@ -37,6 +37,7 @@

#include <mss.H>
#include <lib/ccs/ccs.H>
#include <lib/fir/check.H>

using fapi2::TARGET_TYPE_MCBIST;
using fapi2::TARGET_TYPE_MCA;
Expand Down Expand Up @@ -83,6 +84,7 @@ fapi2::ReturnCode fail_type( const fapi2::Target<TARGET_TYPE_MCBIST>& i_target,
const uint64_t& i_type,
const fapi2::Target<TARGET_TYPE_MCA>& i_mca )
{
fapi2::ReturnCode l_failing_rc(fapi2::FAPI2_RC_SUCCESS);
// Including the MCA_TARGET here and below at CAL_TIMEOUT since these problems likely lie at the MCA level
// So we disable the PORT and hopefully that's it
// If the problem lies with the MCBIST, it'll just have to loop
Expand Down Expand Up @@ -112,7 +114,10 @@ fapi2::ReturnCode fail_type( const fapi2::Target<TARGET_TYPE_MCBIST>& i_target,
fapi2::MSS_CCS_HUNG().set_MCBIST_TARGET(i_target),
"%s CCS appears hung", mss::c_str(i_target));
fapi_try_exit:
return fapi2::current_err;
// Due to the PRD update, we need to check for FIR's
// If any FIR's have lit up, this CCS fail could have been caused by the FIR
// So, let PRD retrigger this step to see if we can resolve the issue
return mss::check::fir_or_pll_fail(i_target, fapi2::current_err);
}

///
Expand Down
Expand Up @@ -64,7 +64,7 @@ fapi2::ReturnCode mrs_engine( const fapi2::Target<fapi2::TARGET_TYPE_DIMM>& i_ta
const uint64_t i_rank,
std::vector< ccs::instruction_t<fapi2::TARGET_TYPE_MCBIST> >& io_inst )
{
FAPI_TRY( mrs_engine(i_target, i_data, i_rank, i_data.iv_delay, io_inst) );
FAPI_TRY( mrs_engine(i_target, i_data, i_rank, i_data.iv_delay, io_inst) );

fapi_try_exit:
return fapi2::current_err;
Expand Down
16 changes: 7 additions & 9 deletions src/import/chips/p9/procedures/hwp/memory/lib/dimm/rank.C
Expand Up @@ -233,7 +233,6 @@ fapi_try_exit:

///
/// @brief Return a vector of rank numbers which represent the primary rank pairs for this port
/// @tparam T the target type
/// @param[in] i_target TARGET_TYPE_MCA
/// @param[out] o_rps a vector of rank_pairs
/// @return FAPI2_RC_SUCCESS iff all is ok
Expand All @@ -251,7 +250,7 @@ fapi2::ReturnCode primary_ranks( const fapi2::Target<TARGET_TYPE_MCA>& i_target,
FAPI_TRY( mss::eff_num_master_ranks_per_dimm(d, l_rank_count[mss::index(d)]) );
}

FAPI_DBG("ranks: %d, %d", l_rank_count[0], l_rank_count[1]);
FAPI_DBG("%s ranks: %d, %d", mss::c_str(i_target), l_rank_count[0], l_rank_count[1]);

// Walk through rank pair table and skip empty pairs
o_rps.clear();
Expand All @@ -264,13 +263,15 @@ fapi2::ReturnCode primary_ranks( const fapi2::Target<TARGET_TYPE_MCA>& i_target,
}
}

// Returning success in case no DIMM's are configured
return fapi2::FAPI2_RC_SUCCESS;

fapi_try_exit:
return fapi2::current_err;
}

///
/// @brief Return a vector of rank numbers which represent the primary rank pairs for this dimm
/// @tparam T the target type
/// @param[in] i_target TARGET_TYPE_DIMM
/// @param[out] o_rps a vector of rank_pairs
/// @return FAPI2_RC_SUCCESS iff all is ok
Expand Down Expand Up @@ -344,7 +345,6 @@ fapi_try_exit:

///
/// @brief Given a target, get the rank pair assignments, based on DIMMs
/// @tparam T the fapi2::TargetType
/// @param[in] i_target the target (MCA or MBA?)
/// @param[out] o_registers the register settings for the appropriate rank pairs
/// @return FAPI2_RC_SUCCESS if and only if ok
Expand Down Expand Up @@ -382,8 +382,7 @@ fapi_try_exit:

///
/// @brief Setup the rank information in the port
/// @tparam T the fapi2::TargetType
/// @param[in] i_target the target (MCA or MBA?)
/// @param[in] i_target the target (MCA)
/// @return FAPI2_RC_SUCCESS if and only if ok
///
template<>
Expand Down Expand Up @@ -485,7 +484,6 @@ fapi_try_exit:
///
/// @brief Get a vector of configured rank pairs.
/// Returns a vector of ordinal values of the configured rank pairs. e.g., for a 2R DIMM, {0, 1}
/// @tparam T the fapi2::TargetType
/// @param[in]i_target the target (MCA or MBA?)
/// @param[out] o_pairs std::vector of rank pairs configured
/// @return FAPI2_RC_SUCCESS if and only if ok
Expand Down Expand Up @@ -565,15 +563,15 @@ fapi_try_exit:
///
/// @brief Get a rank-pair id from a physical rank
/// Returns a number representing which rank-pair this rank is a part of
/// @tparam T the fapi2::TargetType
/// @param[in] i_target the target (MCA or MBA?)
/// @param[in] i_rank the physical rank number
/// @param[out] o_pairs the rank pair
/// @return FAPI2_RC_SUCCESS if and only if ok, FAPI2_RC_INVALID_PARAMETER if the rank isn't found
///
template<>
fapi2::ReturnCode get_pair_from_rank(const fapi2::Target<TARGET_TYPE_MCA>& i_target,
uint64_t i_rank, uint64_t& o_pair)
uint64_t i_rank,
uint64_t& o_pair)
{
// Sort of brute-force, but no real good other way to do it. Given the
// rank-pair configuration we walk the config looking for our rank, and
Expand Down
4 changes: 2 additions & 2 deletions src/import/chips/p9/procedures/hwp/memory/lib/dimm/rank.H
Expand Up @@ -1060,7 +1060,7 @@ inline fapi2::ReturnCode set_pair_valid( const fapi2::Target<T>& i_target,
fapi2::MSS_INVALID_RANK()
.set_RANK(i_rank)
.set_MCA_TARGET(i_target)
.set_FUNCTION(GET_RANKS_IN_PAIR),
.set_FUNCTION(SET_PAIR_VALID),
"%s Invalid rank (%d) in get_ranks_in_pair",
mss::c_str(i_target),
i_rank);
Expand Down Expand Up @@ -1231,7 +1231,7 @@ fapi2::ReturnCode get_ranks_in_pair( const fapi2::Target<T>& i_target,
// Get data
for (uint64_t l_ordinal = 0; l_ordinal < TT::NUM_RANKS_IN_PAIR; ++l_ordinal)
{
// Check to make sure rank is vlaid
// Check to make sure rank is valid
FAPI_ASSERT( l_ordinal < MAX_RANK_PER_DIMM,
fapi2::MSS_INVALID_RANK()
.set_RANK(l_ordinal)
Expand Down
202 changes: 202 additions & 0 deletions src/import/chips/p9/procedures/hwp/memory/lib/fir/check.C
Expand Up @@ -36,6 +36,8 @@
#include <fapi2.H>
#include <p9_mc_scom_addresses.H>
#include <p9_mc_scom_addresses_fld.H>
#include <p9_perv_scom_addresses.H>
#include <p9_perv_scom_addresses_fld.H>

#include <generic/memory/lib/utils/scom.H>
#include <lib/fir/fir.H>
Expand Down Expand Up @@ -205,6 +207,9 @@ fapi2::ReturnCode during_draminit_training( const fapi2::Target<fapi2::TARGET_TY
fapi2::buffer<uint64_t> l_phyfir_data;
fapi2::buffer<uint64_t> l_phyfir_masked;

// If we have a FIR that is lit up, we want to see if it could have been caused by a more drastic FIR
bool l_check_fir = false;

FAPI_TRY( mss::getScom(l_mca, MCA_IOM_PHY0_DDRPHY_FIR_REG, l_phyfir_data) );

l_phyfir_masked = l_phyfir_data & l_phyfir_mask;
Expand All @@ -213,6 +218,8 @@ fapi2::ReturnCode during_draminit_training( const fapi2::Target<fapi2::TARGET_TY
// We'll have the error log to know what fir bit triggered and when, so we should be fine clearing here
FAPI_TRY( mss::putScom(l_mca, MCA_IOM_PHY0_DDRPHY_FIR_REG_AND, l_phyfir_mask.invert()) );

// Check the FIR here
l_check_fir = true;
FAPI_ASSERT( l_phyfir_masked == 0,
fapi2::MSS_DRAMINIT_TRAINING_PORT_FIR()
.set_PHY_FIR(l_phyfir_masked)
Expand All @@ -221,9 +228,204 @@ fapi2::ReturnCode during_draminit_training( const fapi2::Target<fapi2::TARGET_TY
"Initial CAL failed: Reporting FIR bits set for %s ( phy: 0x%016lx",
mss::c_str(i_target), l_phyfir_masked);

fapi_try_exit:

// Handle any fails seen above accordingly
return mss::check::fir_or_pll_fail( i_target, fapi2::current_err, l_check_fir);
}

// Declares FIR registers that are re-used between multiple functions
// Vectors of FIR and mask registers to read through
// As check_fir can be called in multiple places, we don't know what the mask may hold
// In order to tell if a FIR is legit or not, we read the FIR and check it against the mask reg
// Note: using a vector here in case we need to expand
static const std::vector<std::pair<uint64_t, uint64_t>> MCBIST_FIR_REGS =
{
// MCBIST FIR
{MCBIST_MCBISTFIRQ, MCBIST_MCBISTFIRMASK},
};

static const std::vector<std::pair<uint64_t, uint64_t>> MCA_FIR_REGS =
{
// MCA ECC FIR
{MCA_FIR, MCA_MASK},
// MCA CAL FIR
{MCA_MBACALFIRQ, MCA_MBACALFIR_MASK},
// DDRPHY FIR
{MCA_IOM_PHY0_DDRPHY_FIR_REG, MCA_IOM_PHY0_DDRPHY_FIR_MASK_REG},
};

///
/// @brief Checks whether any of the PLL unlock values are set
/// @param[in] i_local_fir - the overall FIR register
/// @param[in] i_perv_fir - the pervasive PLL FIR
/// @param[in] i_mc_fir - the memory controller FIR
/// @return bool true iff the overall FIR bit is set and either the MC or the TP data shows a PLL unlock
///
bool pll_unlock( const fapi2::buffer<uint64_t>& i_local_fir,
                 const fapi2::buffer<uint64_t>& i_perv_fir,
                 const fapi2::buffer<uint64_t>& i_mc_fir )
{
    // Note: no scom fields were defined for the following registers, so the bit positions are constexpr'ed here
    constexpr uint64_t PERV_TP_ERROR_START = 25;
    constexpr uint64_t PERV_TP_ERROR_LEN = 4;
    constexpr uint64_t PERV_MC_ERROR_START = 25;

    // The overall FIR (bit 21) gates everything else: if it is not set, there is nothing more to look at
    const bool l_overall_fir = i_local_fir.getBit<PERV_1_LOCAL_FIR_IN21>();

    if(l_overall_fir == false)
    {
        FAPI_INF("Did not have the PERV_LOCAL_FIR bit set. No PLL error, exiting");
        return false;
    }

    // Sample the TP PLL unlock bits and the MC PLL unlock bit once, then reuse them for the trace and the result
    const bool l_tp_unlock = i_perv_fir.getBit<PERV_TP_ERROR_START, PERV_TP_ERROR_LEN>();
    const bool l_mc_unlock = i_mc_fir.getBit<PERV_MC_ERROR_START>();

    // Trace which of the two sources (if any) is reporting the unlock
    FAPI_INF("PERV_TP_ERROR_REG %s PERV_MC01_ERROR_REG %s",
             l_tp_unlock ? "PLL lock fail" : "PLL ok",
             l_mc_unlock ? "PLL lock fail" : "PLL ok");

    // A PLL unlock is reported if the MC PLL unlock bit is on or any of the TP PLL unlock bits are on
    return l_mc_unlock || l_tp_unlock;
}

///
/// @brief Checks whether any PLL FIRs have been set on a target
/// @param[in] i_target - the target on which to operate
/// @param[out] o_fir_error - true iff a PLL unlock was found; left false if a scom fails
/// @return fapi2::ReturnCode FAPI2_RC_SUCCESS iff ok
///
fapi2::ReturnCode pll_fir( const fapi2::Target<fapi2::TARGET_TYPE_MCBIST>& i_target, bool& o_fir_error )
{
    // Default the output to "no FIR" up front, so it holds a known value if one of the scoms below fails
    o_fir_error = false;

    // The TP registers are read from the processor chip associated with this MCBIST
    const auto l_chip = mss::find_target<fapi2::TARGET_TYPE_PROC_CHIP>(i_target);

    // Holders for the three registers handed to pll_unlock
    fapi2::buffer<uint64_t> l_tp_local_fir;
    fapi2::buffer<uint64_t> l_tp_error_reg;
    fapi2::buffer<uint64_t> l_mc_error_reg;

    // Overall local FIR - read from the processor chip
    FAPI_TRY(mss::getScom(l_chip, PERV_TP_LOCAL_FIR, l_tp_local_fir),
             "%s failed to get 0x%016llx",
             mss::c_str(i_target),
             PERV_TP_LOCAL_FIR);

    // TP error register - read from the processor chip
    FAPI_TRY(mss::getScom(l_chip, PERV_TP_ERROR_REG, l_tp_error_reg),
             "%s failed to get 0x%016llx",
             mss::c_str(i_target),
             PERV_TP_ERROR_REG);

    // MC error register - read from the MCBIST target itself
    FAPI_TRY(mss::getScom(i_target, PERV_MC01_ERROR_REG, l_mc_error_reg),
             "%s failed to get 0x%016llx",
             mss::c_str(i_target),
             PERV_MC01_ERROR_REG);

    // All reads passed, so decode the register data
    o_fir_error = pll_unlock(l_tp_local_fir, l_tp_error_reg, l_mc_error_reg);

fapi_try_exit:
    return fapi2::current_err;
}

///
/// @brief Checks whether any FIR have lit up
/// @param[in] i_target - the target on which to operate - MCBIST specialization
/// @param[out] o_fir_error - true iff a FIR was hit
/// @return fapi2::ReturnCode FAPI2_RC_SUCCESS iff ok
/// @note Checks the MCBIST FIRs first, then the FIRs of every MCA found under the MCBIST, then PLL unlocks.
/// The search stops at the first FIR that is found
///
template<>
fapi2::ReturnCode bad_fir_bits( const fapi2::Target<fapi2::TARGET_TYPE_MCBIST>& i_target, bool& o_fir_error )
{
    // Nothing found yet
    o_fir_error = false;

    // Pass 1: the MCBIST level FIR/mask pairs
    for(const auto& l_reg_pair : MCBIST_FIR_REGS)
    {
        FAPI_TRY(fir_with_mask(i_target, l_reg_pair, o_fir_error));

        // One bad FIR is all we need to know about - stop looking
        if(o_fir_error)
        {
            return fapi2::FAPI2_RC_SUCCESS;
        }
    }

    // Pass 2: the MCA level FIR/mask pairs, for each MCA under this MCBIST
    for(const auto& l_port : mss::find_targets<fapi2::TARGET_TYPE_MCA>(i_target))
    {
        for(const auto& l_reg_pair : MCA_FIR_REGS)
        {
            FAPI_TRY(fir_with_mask(l_port, l_reg_pair, o_fir_error));

            // One bad FIR is all we need to know about - stop looking
            if(o_fir_error)
            {
                return fapi2::FAPI2_RC_SUCCESS;
            }
        }
    }

    // Pass 3: no FIR was found above, so the PLL unlock check decides the result
    FAPI_TRY(pll_fir(i_target, o_fir_error));

fapi_try_exit:
    return fapi2::current_err;
}


///
/// @brief Checks whether any FIR have lit up
/// @param[in] i_target - the target on which to operate - MCA specialization
/// @param[out] o_fir_error - true iff a FIR was hit
/// @return fapi2::ReturnCode FAPI2_RC_SUCCESS iff ok
/// @note Checks the FIRs of the MCBIST associated with this MCA first, then this MCA's own FIRs, then PLL unlocks.
/// The search stops at the first FIR that is found
///
template<>
fapi2::ReturnCode bad_fir_bits( const fapi2::Target<fapi2::TARGET_TYPE_MCA>& i_target, bool& o_fir_error )
{
    // The MCBIST is needed for the MCBIST level FIRs and for the PLL check
    const auto& l_parent_mcbist = mss::find_target<fapi2::TARGET_TYPE_MCBIST>(i_target);

    // Nothing found yet
    o_fir_error = false;

    // Pass 1: the MCBIST level FIR/mask pairs
    for(const auto& l_reg_pair : MCBIST_FIR_REGS)
    {
        FAPI_TRY(fir_with_mask(l_parent_mcbist, l_reg_pair, o_fir_error));

        // One bad FIR is all we need to know about - stop looking
        if(o_fir_error)
        {
            return fapi2::FAPI2_RC_SUCCESS;
        }
    }

    // Pass 2: the FIR/mask pairs of this MCA only
    for(const auto& l_reg_pair : MCA_FIR_REGS)
    {
        FAPI_TRY(fir_with_mask(i_target, l_reg_pair, o_fir_error));

        // One bad FIR is all we need to know about - stop looking
        if(o_fir_error)
        {
            return fapi2::FAPI2_RC_SUCCESS;
        }
    }

    // Pass 3: no FIR was found above, so the PLL unlock check decides the result
    FAPI_TRY(pll_fir(l_parent_mcbist, o_fir_error));

fapi_try_exit:
    return fapi2::current_err;
}


///
/// @brief Checks whether any FIR have lit up
/// @param[in] i_target - the target on which to operate - DIMM specialization
/// @param[out] o_fir_error - true iff a FIR was hit
/// @return fapi2::ReturnCode FAPI2_RC_SUCCESS iff ok
/// @note No DIMM specific registers are read here; the check is forwarded to bad_fir_bits
/// for the MCA found from this DIMM
///
template<>
fapi2::ReturnCode bad_fir_bits( const fapi2::Target<fapi2::TARGET_TYPE_DIMM>& i_target, bool& o_fir_error )
{
    // Look up the MCA for this DIMM and let the MCA level check do the work
    return bad_fir_bits(mss::find_target<fapi2::TARGET_TYPE_MCA>(i_target), o_fir_error);
}

}
}

0 comments on commit 3890040

Please sign in to comment.