Skip to content

Commit

Permalink
Add nvdimm arm retry logic in case of glitches
Browse files Browse the repository at this point in the history
Random BPM glitches could cause a temporary loss of persistency and
make the arm command fail. Allow an arm retry if the glitch persisted
for only a second or less.

Change-Id: I28e65b05e482129f6fea34580064a825923aaaf3
CQ:SW477211
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/84668
Reviewed-by: Matt Derksen <mderkse1@us.ibm.com>
Reviewed-by: Daniel M Crowell <dcrowell@us.ibm.com>
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Reviewed-by: Dean Sanner <dsanner@us.ibm.com>
  • Loading branch information
Tsung Yeung authored and sannerd committed Oct 4, 2019
1 parent 3920d16 commit 34f119b
Show file tree
Hide file tree
Showing 2 changed files with 194 additions and 88 deletions.
1 change: 1 addition & 0 deletions src/include/usr/isteps/nvdimm/nvdimmreasoncodes.H
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,7 @@ enum nvdimmReasonCode
NVDIMM_ERASE_ERROR = NVDIMM_COMP_ID | 0x51,
NVDIMM_ARM_PRE_CHECK_FAILED = NVDIMM_COMP_ID | 0x52,
NVDIMM_ARM_ENCRYPTION_UNLOCK_FAILED = NVDIMM_COMP_ID | 0x53,
NVDIMM_ARM_RETRY = NVDIMM_COMP_ID | 0x54,
};

enum UserDetailsTypes
Expand Down
281 changes: 193 additions & 88 deletions src/usr/isteps/nvdimm/runtime/nvdimm_rt.C
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
#include <trace/interface.H>
#include <errl/errlentry.H>
#include <errl/errlmanager.H>
#include <errl/errludstring.H>
#include <util/runtime/rt_fwreq_helper.H>
#include <targeting/common/attributes.H>
#include <targeting/common/commontargeting.H>
Expand All @@ -50,6 +51,7 @@
#include "../nvdimmErrorLog.H"
#include <isteps/nvdimm/nvdimm.H> // implements some of these
#include "../nvdimm.H" // for g_trac_nvdimm
#include <sys/time.h>

//#define TRACUCOMP(args...) TRACFCOMP(args)
#define TRACUCOMP(args...)
Expand All @@ -63,6 +65,7 @@ namespace NVDIMM
static constexpr uint64_t DARN_ERROR_CODE = 0xFFFFFFFFFFFFFFFFull;
static constexpr uint32_t MAX_DARN_ERRORS = 10;
static constexpr uint8_t FW_OPS_UPDATE = 0x04;
static constexpr size_t ARM_MAX_RETRY_COUNT = 1;
/**
* @brief This function polls the command status register for arm completion
* (does not indicate success or fail)
Expand Down Expand Up @@ -234,9 +237,8 @@ errlHndl_t nvdimmArmPreCheck(Target* i_nvdimm)
*@userdata1[48:56] l_ready
*@userdata1[57:63] l_fwupdate
*@userdata2 <UNUSED>
*@devdesc NVDIMM threw an error or failed to set event
* notifications during arming
*@custdesc NVDIMM failed to enable event notificaitons
*@devdesc NVDIMM failed arm precheck. Refer to FFDC for exact reason
*@custdesc NVDIMM failed the arm precheck and is unable to arm
*/
l_err = new ERRORLOG::ErrlEntry( ERRORLOG::ERRL_SEV_PREDICTIVE,
NVDIMM_ARM_PRE_CHECK,
Expand Down Expand Up @@ -422,112 +424,215 @@ bool nvdimmArm(TargetHandleList &i_nvdimmTargetList)
break;
}

l_err = NVDIMM::nvdimmChangeArmState(l_nvdimm, ARM_TRIGGER);
// If we run into any error here we will just
// commit the error log and move on. Let the
// system continue to boot and let the user
// salvage the data
if (l_err)
{
TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] failed to trigger arm", get_huid(l_nvdimm));

nvdimmDisarm(i_nvdimmTargetList);

// Committing the error as we don't want this to interrupt
// the boot. This will notify the user that action is needed
// on this module
l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE);
l_err->collectTrace(NVDIMM_COMP_NAME);
errlCommit( l_err, NVDIMM_COMP_ID );
o_arm_successful = false;
continue;
}

// Arm happens one module at a time. No need to set any offset on the counter
uint32_t l_poll = 0;
l_err = nvdimmPollArmDone(l_nvdimm, l_poll);
if (l_err)
{
TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] arm command timed out", get_huid(l_nvdimm));
l_arm_timeout = true;

l_err_t = notifyNvdimmProtectionChange(l_nvdimm, NVDIMM_DISARMED);
if (l_err_t)
{
errlCommit( l_err_t, NVDIMM_COMP_ID );
}

// Committing the error as we don't want this to interrupt
// the boot. This will notify the user that action is needed
// on this module
l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE);
l_err->collectTrace(NVDIMM_COMP_NAME);
bool l_is_retryable = true;
//continue flag set by the retry loop to continue on the outer loop
bool l_continue_arm = false;
//break flag set by the retry loop to break on the outer loop
bool l_break = false;
errlHndl_t l_err_retry = nullptr;

errlCommit( l_err, NVDIMM_COMP_ID );
o_arm_successful = false;
}

// Pass l_arm_timeout value in for health status check
l_continue = l_arm_timeout;

// Check health status registers and exit if required
l_err = nvdimmHealthStatusCheck( l_nvdimm, HEALTH_PRE_ARM, l_continue );

// Check for health status failure
if (l_err)
// Attempt arm multiple times in case of glitches
for (size_t l_retry = 0; l_retry <= ARM_MAX_RETRY_COUNT; l_retry++)
{
TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] failed first health status check", get_huid(l_nvdimm));

// The arm timeout variable is used here as the continue variable for the
// health status check. This was done to include the timeout for use in the check
// If true either the arm timed out with a health status fail or the
// health status check failed with another disarm and exit condition
if (!l_continue)
l_err = NVDIMM::nvdimmChangeArmState(l_nvdimm, ARM_TRIGGER);
// If we run into any error here we will just
// commit the error log and move on. Let the
// system continue to boot and let the user
// salvage the data
if (l_err)
{
errlCommit( l_err, NVDIMM_COMP_ID );
TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] failed to trigger arm", get_huid(l_nvdimm));

// Disarming all dimms due to error
nvdimmDisarm(i_nvdimmTargetList);

// Committing the error as we don't want this to interrupt
// the boot. This will notify the user that action is needed
// on this module
l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE);
l_err->collectTrace(NVDIMM_COMP_NAME);
errlCommit( l_err, NVDIMM_COMP_ID );
o_arm_successful = false;

// Cause the main loop to skip the rest of the arm procedure
// and move to the next target
l_continue_arm = true;
break;
}
else

// Arm happens one module at a time. No need to set any offset on the counter
uint32_t l_poll = 0;
l_err = nvdimmPollArmDone(l_nvdimm, l_poll);
if (l_err)
{
TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] arm command timed out", get_huid(l_nvdimm));
l_arm_timeout = true;

l_err_t = notifyNvdimmProtectionChange(l_nvdimm, NVDIMM_DISARMED);
if (l_err_t)
{
errlCommit( l_err_t, NVDIMM_COMP_ID );
}

// Committing the error as we don't want this to interrupt
// the boot. This will notify the user that action is needed
// on this module
l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE);
l_err->collectTrace(NVDIMM_COMP_NAME);

errlCommit( l_err, NVDIMM_COMP_ID );
continue;
o_arm_successful = false;
}
}

l_err = nvdimmCheckArmSuccess(l_nvdimm, l_arm_timeout);
if (l_err)
{
TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] failed to succesfully arm", get_huid(l_nvdimm));
// Pass l_arm_timeout value in for health status check
l_continue = l_arm_timeout;

// Disarming all dimms due to error
nvdimmDisarm(i_nvdimmTargetList);
// Sleep for 1 second before checking the health status
// to let the glitches settle in case there were any
nanosleep(1, 0);

// Check health status registers and exit if required
l_err = nvdimmHealthStatusCheck( l_nvdimm, HEALTH_PRE_ARM, l_continue );

l_err_t = notifyNvdimmProtectionChange(l_nvdimm, NVDIMM_DISARMED);
if (l_err_t)
// Check for health status failure
// Any fail picked up by the health check is a legit fail
if (l_err)
{
errlCommit( l_err_t, NVDIMM_COMP_ID );
TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] failed first health status check", get_huid(l_nvdimm));

// The arm timeout variable is used here as the continue variable for the
// health status check. This was done to include the timeout for use in the check
// If true either the arm timed out with a health status fail or the
// health status check failed with another disarm and exit condition
if (!l_continue)
{
errlCommit( l_err, NVDIMM_COMP_ID );

// Disarming all dimms due to error
nvdimmDisarm(i_nvdimmTargetList);
o_arm_successful = false;

// Cause the main loop to exit out of the main arm procedure
l_break = true;
break;
}
else
{
errlCommit( l_err, NVDIMM_COMP_ID );

// Cause the main loop to skip the rest of the arm procedure
// and move to the next target
l_continue_arm = true;
break;
}
}

// Committing the error as we don't want this to interrupt
// the boot. This will notify the user that action is needed
// on this module
l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE);
l_err->collectTrace(NVDIMM_COMP_NAME);

// Dump Traces for error logs
nvdimmTraceRegs( l_nvdimm, l_RegInfo );
nvdimmAddPage4Regs(l_nvdimm,l_err);
l_err = nvdimmCheckArmSuccess(l_nvdimm, l_arm_timeout);

// Add reg traces to the error log
NVDIMM::UdNvdimmOPParms( l_RegInfo ).addToLog(l_err);
// At this point we have passed the health check. If the arm were
// to fail now, it is likely it was due to some glitch. Let's retry
// the arm again as long as the fail is not due to timeout.
// A timeout would mean a charging issue, it would have been caught
// by the health check.
l_is_retryable = !l_arm_timeout && l_retry < ARM_MAX_RETRY_COUNT;
if (l_err)
{
TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] failed to succesfully arm. %s retryable.",
get_huid(l_nvdimm), l_is_retryable? "IS" : "NOT");

if (l_is_retryable)
{
// Save the original error
l_err_retry = l_err;

/*@
*@errortype
*@reasoncode NVDIMM_ARM_RETRY
*@severity ERRORLOG_SEV_INFORMATIONAL
*@moduleid NVDIMM_ARM_ERASE
*@userdata1[0:31] Target Huid
*@userdata1[32:39] l_is_retryable
*@userdata1[40:47] MAX arm retry count
*@userdata2[0:31] Original errlog plid
*@userdata2[32:63] Original errlog reason code
*@devdesc NVDIMM encountered a glitch causing the initial
* arm to fail. System firmware will retry the arm
*@custdesc NVDIMM requires an arm retry
*/
l_err = new ERRORLOG::ErrlEntry( ERRORLOG::ERRL_SEV_INFORMATIONAL,
NVDIMM_ARM_ERASE,
NVDIMM_ARM_RETRY,
NVDIMM_SET_USER_DATA_1(TARGETING::get_huid(l_nvdimm),
FOUR_UINT8_TO_UINT32(l_is_retryable, ARM_MAX_RETRY_COUNT,0,0)),
TWO_UINT32_TO_UINT64(l_err_retry->plid(), l_err_retry->reasonCode()),
ERRORLOG::ErrlEntry::NO_SW_CALLOUT );

l_err->collectTrace( NVDIMM_COMP_NAME );

// Callout the dimm
l_err->addHwCallout( l_nvdimm,
HWAS::SRCI_PRIORITY_LOW,
HWAS::NO_DECONFIG,
HWAS::GARD_NULL);

errlCommit( l_err, NVDIMM_COMP_ID );
}
else
{
// Handle retryable error
if (l_err_retry)
{
ERRORLOG::ErrlUserDetailsString("Arm RETRY failed").addToLog(l_err_retry);

// Delete the current errlog and use the original errlog for callout
delete l_err;
l_err = l_err_retry;
l_err_retry = nullptr;
}

// Disarming all dimms due to error
nvdimmDisarm(i_nvdimmTargetList);

l_err_t = notifyNvdimmProtectionChange(l_nvdimm, NVDIMM_DISARMED);
if (l_err_t)
{
errlCommit( l_err_t, NVDIMM_COMP_ID );
}

// Committing the error as we don't want this to interrupt
// the boot. This will notify the user that action is needed
// on this module
l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE);
l_err->collectTrace(NVDIMM_COMP_NAME);

// Dump Traces for error logs
nvdimmTraceRegs( l_nvdimm, l_RegInfo );
nvdimmAddPage4Regs(l_nvdimm,l_err);

// Add reg traces to the error log
NVDIMM::UdNvdimmOPParms( l_RegInfo ).addToLog(l_err);

errlCommit(l_err, NVDIMM_COMP_ID);
o_arm_successful = false;

// Cause the main loop to exit out of the main arm procedure
l_break = true;
break;
}
}
else
{
// Arm worked. Exit the retry loop
break;
} // close nvdimmCheckArmSuccess check
} // close arm retry loop

errlCommit(l_err, NVDIMM_COMP_ID);
o_arm_successful = false;
if (l_continue_arm)
{
continue;
}
else if (l_break)
{
break;
}

Expand Down

0 comments on commit 34f119b

Please sign in to comment.