Skip to content

Commit

Permalink
Fix restore fail due to restore attempt on empty flash
Browse files Browse the repository at this point in the history
-fix NVDIMM FFDC misalignment
-correct NV_STATUS enum naming

Change-Id: Ib0084f5fb95ce8a93ee5e85a0790878469065acb
CQ:SW473934
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/82775
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Reviewed-by: Matt Derksen <mderkse1@us.ibm.com>
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
Reviewed-by: Daniel M Crowell <dcrowell@us.ibm.com>
  • Loading branch information
Tsung Yeung authored and dcrowell77 committed Aug 27, 2019
1 parent a95709f commit b94854d
Show file tree
Hide file tree
Showing 5 changed files with 91 additions and 73 deletions.
6 changes: 4 additions & 2 deletions src/include/usr/isteps/nvdimm/nvdimm.H
Expand Up @@ -34,12 +34,14 @@ enum nvdimm_err_status
{
NSTD_VAL_ERASED = 0x08, // Image erased, SCM device contents not persisted
NSTD_VAL_ERASED_MASK = 0xF7,
NSTD_VAL_ERROR = 0x04, // Valid image successfully restored, SCM persisted
NSTD_VAL_ERROR_MASK = 0xFB,
NSTD_VAL_RESTORED = 0x04, // Valid image successfully restored, SCM persisted
NSTD_VAL_RESTORED_MASK = 0xFB,
NSTD_VAL_SR_FAILED = 0x02, // Save/Restore failed to persist memory contents
NSTD_VAL_SR_FAILED_MASK = 0xFD,
NSTD_VAL_DISARMED = 0x01, // memory unable to preserve future content
NSTD_VAL_DISARMED_MASK = 0xFE,
NSTD_ERR_VAL_SR = 0x40, // Partially working. Error detected but save/restore (SR) may still work
NSTD_ERR_VAL_SR_MASK = 0xBF,
NSTD_ERR = 0x03, // NSTD_ERR_NOPRSV+NSTD_ERR_NOBKUP
};

Expand Down
100 changes: 55 additions & 45 deletions src/usr/isteps/nvdimm/nvdimm.C
Expand Up @@ -381,19 +381,19 @@ void nvdimmSetStatusFlag(Target *i_nvdimm, const uint8_t i_status_flag)

switch(i_status_flag)
{
// Make sure NSTD_VAL_ERROR (content preserved) is unset before setting NSTD_VAL_ERASED
// Make sure NSTD_VAL_RESTORED (content preserved) is unset before setting NSTD_VAL_ERASED
// (data not preserved) or NSTD_VAL_SR_FAILED (error preserving data)
case NSTD_ERR:
case NSTD_VAL_ERASED:
case NSTD_VAL_SR_FAILED:
l_statusFlag &= NSTD_VAL_ERROR_MASK;
l_statusFlag &= NSTD_VAL_RESTORED_MASK;
l_statusFlag |= i_status_flag;
break;

// If the content preserved(restore sucessfully), make sure
// NSTD_VAL_ERASED (not preserved) and NSTD_VAL_SR_FAILED (error preserving)
// are unset before setting this flag.
case NSTD_VAL_ERROR:
case NSTD_VAL_RESTORED:
l_statusFlag &= (NSTD_VAL_ERASED_MASK & NSTD_VAL_SR_FAILED_MASK);
l_statusFlag |= i_status_flag;
break;
Expand All @@ -402,6 +402,11 @@ void nvdimmSetStatusFlag(Target *i_nvdimm, const uint8_t i_status_flag)
l_statusFlag |= i_status_flag;
break;

// Error detected but save/restore might work. May coexsit with other bits.
case NSTD_ERR_VAL_SR:
l_statusFlag |= i_status_flag;
break;

default:
assert(0, "nvdimmSetStatusFlag() HUID[%X], i_status_flag[%X] invalid flag!",
get_huid(i_nvdimm), i_status_flag);
Expand Down Expand Up @@ -1165,12 +1170,6 @@ errlHndl_t nvdimmRestore(TargetHandleList& i_nvdimmList, uint8_t &i_mpipl)
break;
}

// Nothing to do. Move on.
if (i_nvdimmList.empty())
{
break;
}

// Kick off the restore on each nvdimm in the nvdimm list
for (const auto & l_nvdimm : i_nvdimmList)
{
Expand Down Expand Up @@ -1647,36 +1646,12 @@ errlHndl_t nvdimm_restore(TargetHandleList &i_nvdimmList)
assert(l_sys, "nvdimm_restore: no TopLevelTarget");
uint8_t l_mpipl = l_sys->getAttr<ATTR_IS_MPIPL_HB>();
nvdimm_reg_t l_RegInfo = nvdimm_reg_t();
TargetHandleList l_nvdimmList = i_nvdimmList;
TargetHandleList l_nvdimm_restore_list = i_nvdimmList;
uint8_t l_rstrValid;

do
{
for (const auto & l_nvdimm : i_nvdimmList)
{
// Check for a valid image
l_err = nvdimmValidImage( l_nvdimm, l_valid );
if (l_err)
{
TRACFCOMP(g_trac_nvdimm, "nvdimmRestore() nvdimm[%X] restore failed to read the image", get_huid(l_nvdimm));
errlCommit(l_err, NVDIMM_COMP_ID);
}

if (!l_valid)
{
TRACFCOMP(g_trac_nvdimm, "nvdimmRestore() nvdimm[%X] restore failed due to invalid image", get_huid(l_nvdimm));
// Set ATTR NV STATUS FLAG to Erased
nvdimmSetStatusFlag(l_nvdimm, NSTD_VAL_ERASED);
break;
}

}

if (!l_valid)
{
break;
}

// Check MPIPL case first to make sure any on-going backup is complete
if (l_mpipl)
{
// During MPIPL, make sure any in-progress save is completed before proceeding
Expand All @@ -1697,15 +1672,50 @@ errlHndl_t nvdimm_restore(TargetHandleList &i_nvdimmList)
}
}

// Compile a list of nvdimms with valid image
// TODO: Reach out to RAS on how to handle odd number of nvdimms
// since we always operate in pairs
for (TargetHandleList::iterator it = l_nvdimm_restore_list.begin();
it != l_nvdimm_restore_list.end();)
{
// Check for a valid image
l_err = nvdimmValidImage( *it, l_valid );
if (l_err)
{
TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimm_restore() nvdimm[%X] Failed to detect valid image", get_huid(*it));
errlCommit(l_err, NVDIMM_COMP_ID);
}

// Remove it from the restore list if there is no valid image
if (!l_valid)
{
TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimm_restore() nvdimm[%X] No valid image discovered", get_huid(*it));
// Set ATTR NV STATUS FLAG to Erased
nvdimmSetStatusFlag(*it, NSTD_VAL_ERASED);
it = l_nvdimm_restore_list.erase(it);

}
else
{
it++;
}
}

// Exit if there is nothing to restore
if (l_nvdimm_restore_list.empty())
{
break;
}

// Start the restore
l_err = nvdimmRestore(l_nvdimmList, l_mpipl);
l_err = nvdimmRestore(l_nvdimm_restore_list, l_mpipl);

// Check if restore completed successfully
if (l_err)
{
const auto l_nvdimm = l_nvdimmList.front();
const auto l_nvdimm = l_nvdimm_restore_list.front();

TRACFCOMP(g_trac_nvdimm, "nvdimm_restore() - Failing nvdimmRestore()");
TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimm_restore() - Failing nvdimmRestore()");
nvdimmSetStatusFlag(l_nvdimm, NSTD_VAL_SR_FAILED);

// Invalid restore could be due to dram not in self-refresh
Expand All @@ -1731,7 +1741,7 @@ errlHndl_t nvdimm_restore(TargetHandleList &i_nvdimmList)

if (l_err)
{
TRACFCOMP(g_trac_nvdimm, "nvdimmRestore() nvdimm[%X] failed during health status check", get_huid(l_nvdimm));
TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimm_restore() nvdimm[%X] failed during health status check", get_huid(l_nvdimm));
if (l_exit)
{
errlCommit( l_err, NVDIMM_COMP_ID );
Expand All @@ -1748,15 +1758,15 @@ errlHndl_t nvdimm_restore(TargetHandleList &i_nvdimmList)
l_err = nvdimmGetRestoreValid(l_nvdimm, l_rstrValid);
if (l_err)
{
TRACFCOMP(g_trac_nvdimm, "nvdimmRestore Target[%X] error validating restore status!",
TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimm_restore Target[%X] error validating restore status!",
get_huid(l_nvdimm));
break;
}

if ((l_rstrValid & RSTR_SUCCESS) == RSTR_SUCCESS)
{
// Restore success!
nvdimmSetStatusFlag(l_nvdimm, NSTD_VAL_ERROR);
nvdimmSetStatusFlag(l_nvdimm, NSTD_VAL_RESTORED);
}

}
Expand Down Expand Up @@ -1956,8 +1966,8 @@ errlHndl_t nvdimm_init(Target *i_nvdimm)
}
else if ((l_data & NO_RESET_N) == NO_RESET_N)
{
// Set ATTR_NV_STATUS_FLAG to restored, as data may persist
nvdimmSetStatusFlag(i_nvdimm, NSTD_VAL_ERROR);
// Set ATTR_NV_STATUS_FLAG to partial working as data may persist
nvdimmSetStatusFlag(i_nvdimm, NSTD_ERR_VAL_SR);
TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmInit() nvdimm[%X]"
"failed to save due to power loss!",get_huid(i_nvdimm));
/*@
Expand Down Expand Up @@ -2063,8 +2073,8 @@ errlHndl_t nvdimm_init(Target *i_nvdimm)
}
else
{
// Set ATTR_NV_STATUS_FLAG to Restored as data might persist
nvdimmSetStatusFlag(i_nvdimm, NSTD_VAL_ERROR);
// Set ATTR_NV_STATUS_FLAG to partial working as data may persist
nvdimmSetStatusFlag(i_nvdimm, NSTD_ERR_VAL_SR);
errlCommit(l_err, NVDIMM_COMP_ID);
}
break;
Expand Down
48 changes: 24 additions & 24 deletions src/usr/isteps/nvdimm/nvdimmErrorLog.C
Expand Up @@ -290,8 +290,8 @@ bool nvdimmCalloutDimm(Target *i_nvdimm, uint8_t i_step, errlHndl_t& o_err)
// Checkout image validity and set dimm status accordingly
if(l_continue)
{
// Set ATTR_NV_STATUS_FLAG to restored as data may still persist
nvdimmSetStatusFlag(i_nvdimm, NSTD_VAL_ERROR);
// Set ATTR_NV_STATUS_FLAG to partially working as data may still persist
nvdimmSetStatusFlag(i_nvdimm, NSTD_ERR_VAL_SR);

// Callout dimm but do not deconfig or gard
o_err->addPartCallout( i_nvdimm,
Expand Down Expand Up @@ -328,8 +328,8 @@ bool nvdimmCalloutDimm(Target *i_nvdimm, uint8_t i_step, errlHndl_t& o_err)
// Check restore status and set dimm status accordingly
if(l_continue)
{
// Set ATTR_NV_STATUS_FLAG to restored as data may still persist
nvdimmSetStatusFlag(i_nvdimm, NSTD_VAL_ERROR);
// Set ATTR_NV_STATUS_FLAG to partially working as data may still persist
nvdimmSetStatusFlag(i_nvdimm, NSTD_ERR_VAL_SR);

// Callout dimm but do not deconfig or gard
o_err->addPartCallout( i_nvdimm,
Expand Down Expand Up @@ -368,8 +368,8 @@ bool nvdimmCalloutDimm(Target *i_nvdimm, uint8_t i_step, errlHndl_t& o_err)
// Check arm status and set dimm status accordingly
if(!l_continue)
{
// Set ATTR_NV_STATUS_FLAG to restored as data may still persist
nvdimmSetStatusFlag(i_nvdimm, NSTD_VAL_ERROR);
// Set ATTR_NV_STATUS_FLAG to partially working as data may still persist
nvdimmSetStatusFlag(i_nvdimm, NSTD_ERR_VAL_SR);

// Callout dimm but do not deconfig or gard
o_err->addPartCallout( i_nvdimm,
Expand Down Expand Up @@ -404,8 +404,8 @@ bool nvdimmCalloutDimm(Target *i_nvdimm, uint8_t i_step, errlHndl_t& o_err)
HWAS::NV_CONTROLLER_PART_TYPE,
HWAS::SRCI_PRIORITY_LOW);

// Set ATTR_NV_STATUS_FLAG to restored as data may persist despite errors
nvdimmSetStatusFlag(i_nvdimm, NSTD_VAL_ERROR);
// Set ATTR_NV_STATUS_FLAG to partially working as data may persist despite errors
nvdimmSetStatusFlag(i_nvdimm, NSTD_ERR_VAL_SR);

break;
}
Expand Down Expand Up @@ -457,8 +457,8 @@ bool nvdimmBPMCableCallout(Target *i_nvdimm, uint8_t i_step, errlHndl_t& o_err)
// Check image validity and set dimm status accordingly
if(l_continue)
{
// Set ATTR_NV_STATUS_FLAG to restored as data may still persist
nvdimmSetStatusFlag(i_nvdimm, NSTD_VAL_ERROR);
// Set ATTR_NV_STATUS_FLAG to partially working as data may still persist
nvdimmSetStatusFlag(i_nvdimm, NSTD_ERR_VAL_SR);

// Callout dimm but do not deconfig or gard
o_err->addPartCallout( i_nvdimm,
Expand Down Expand Up @@ -503,8 +503,8 @@ bool nvdimmBPMCableCallout(Target *i_nvdimm, uint8_t i_step, errlHndl_t& o_err)
// Check restore status and set dimm status accordingly
if(l_continue)
{
// Set ATTR_NV_STATUS_FLAG to restored as data may still persist
nvdimmSetStatusFlag(i_nvdimm, NSTD_VAL_ERROR);
// Set ATTR_NV_STATUS_FLAG to partially working as data may still persist
nvdimmSetStatusFlag(i_nvdimm, NSTD_ERR_VAL_SR);

// Callout dimm but do not deconfig or gard
o_err->addPartCallout( i_nvdimm,
Expand Down Expand Up @@ -549,8 +549,8 @@ bool nvdimmBPMCableCallout(Target *i_nvdimm, uint8_t i_step, errlHndl_t& o_err)
// Check arm status and set dimm status accordingly
if(!l_continue)
{
// Set ATTR_NV_STATUS_FLAG to restored as data may still persist
nvdimmSetStatusFlag(i_nvdimm, NSTD_VAL_ERROR);
// Set ATTR_NV_STATUS_FLAG to partially working as data may still persist
nvdimmSetStatusFlag(i_nvdimm, NSTD_ERR_VAL_SR);

// Callout dimm but do not deconfig or gard
o_err->addPartCallout( i_nvdimm,
Expand Down Expand Up @@ -590,8 +590,8 @@ bool nvdimmBPMCableCallout(Target *i_nvdimm, uint8_t i_step, errlHndl_t& o_err)
HWAS::NV_CONTROLLER_PART_TYPE,
HWAS::SRCI_PRIORITY_LOW);

// Set ATTR_NV_STATUS_FLAG to restored as data may still persist
nvdimmSetStatusFlag(i_nvdimm, NSTD_VAL_ERROR);
// Set ATTR_NV_STATUS_FLAG to partially working as data may still persist
nvdimmSetStatusFlag(i_nvdimm, NSTD_ERR_VAL_SR);

break;
}
Expand Down Expand Up @@ -635,8 +635,8 @@ bool nvdimmBPMCallout(Target *i_nvdimm, uint8_t i_step, errlHndl_t& o_err)
HWAS::NV_CONTROLLER_PART_TYPE,
HWAS::SRCI_PRIORITY_LOW);

// Set ATTR_NV_STATUS_FLAG to restored as data may still persist
nvdimmSetStatusFlag(i_nvdimm, NSTD_VAL_ERROR);
// Set ATTR_NV_STATUS_FLAG to partially working as data may still persist
nvdimmSetStatusFlag(i_nvdimm, NSTD_ERR_VAL_SR);

break;
}
Expand All @@ -654,8 +654,8 @@ bool nvdimmBPMCallout(Target *i_nvdimm, uint8_t i_step, errlHndl_t& o_err)
HWAS::NV_CONTROLLER_PART_TYPE,
HWAS::SRCI_PRIORITY_LOW);

// Set ATTR_NV_STATUS_FLAG to restored as data may still persist
nvdimmSetStatusFlag(i_nvdimm, NSTD_VAL_ERROR);
// Set ATTR_NV_STATUS_FLAG to partially working as data may still persist
nvdimmSetStatusFlag(i_nvdimm, NSTD_ERR_VAL_SR);

break;
}
Expand Down Expand Up @@ -687,8 +687,8 @@ bool nvdimmBPMCallout(Target *i_nvdimm, uint8_t i_step, errlHndl_t& o_err)
// Check arm status and set dimm status accordingly
if(!l_continue)
{
// Set ATTR_NV_STATUS_FLAG to restored as data may still persist
nvdimmSetStatusFlag(i_nvdimm, NSTD_VAL_ERROR);
// Set ATTR_NV_STATUS_FLAG to partially working as data may still persist
nvdimmSetStatusFlag(i_nvdimm, NSTD_ERR_VAL_SR);
}
else
{
Expand Down Expand Up @@ -716,8 +716,8 @@ bool nvdimmBPMCallout(Target *i_nvdimm, uint8_t i_step, errlHndl_t& o_err)
HWAS::NV_CONTROLLER_PART_TYPE,
HWAS::SRCI_PRIORITY_LOW);

// Set ATTR_NV_STATUS_FLAG to restored as data may still persist
nvdimmSetStatusFlag(i_nvdimm, NSTD_VAL_ERROR);
// Set ATTR_NV_STATUS_FLAG to partially working as data may still persist
nvdimmSetStatusFlag(i_nvdimm, NSTD_ERR_VAL_SR);

break;
}
Expand Down
6 changes: 6 additions & 0 deletions src/usr/isteps/nvdimm/plugins/errludP_nvdimm.H
Expand Up @@ -206,6 +206,8 @@ public:
// 1 byte : CSAVE_INFO
// 1 byte : CSAVE_FAIL_INFO0
// 1 byte : CSAVE_FAIL_INFO1
// 1 byte : CSAVE_TIMEOUT_INFO0
// 1 byte : CSAVE_TIMEOUT_INFO1
// 1 byte : ERROR_THRESHOLD_STATUS
// 1 byte : NVDIMM_READY
// 1 byte : NVDIMM_CMD_STATUS0
Expand Down Expand Up @@ -238,6 +240,10 @@ public:
++l_databuf;
i_parser.PrintNumber("CSave Fail Info1 Register: ","%.2lX",TO_UINT8(l_databuf));
++l_databuf;
i_parser.PrintNumber("CSave Timeout Info0 Register: ","%.2lX",TO_UINT8(l_databuf));
++l_databuf;
i_parser.PrintNumber("CSave Timeout Info1 Register: ","%.2lX",TO_UINT8(l_databuf));
++l_databuf;
i_parser.PrintNumber("Error Threshold Status Register: ","%.2lX",TO_UINT8(l_databuf));
++l_databuf;
i_parser.PrintNumber("NVDIMM Ready Register: ","%.2lX",TO_UINT8(l_databuf));
Expand Down
4 changes: 2 additions & 2 deletions src/usr/isteps/nvdimm/runtime/nvdimm_rt.C
Expand Up @@ -384,8 +384,8 @@ bool nvdimmArm(TargetHandleList &i_nvdimmTargetList)
{
TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] failed to set event notification", get_huid(l_nvdimm));

// Set NVDIMM Status flag to Restored, as error detected but data might persist
nvdimmSetStatusFlag(l_nvdimm, NSTD_VAL_ERROR);
// Set NVDIMM Status flag to partial working, as error detected but data might persist
nvdimmSetStatusFlag(l_nvdimm, NSTD_ERR_VAL_SR);

/*@
*@errortype
Expand Down

0 comments on commit b94854d

Please sign in to comment.