diff --git a/src/include/usr/isteps/nvdimm/nvdimm.H b/src/include/usr/isteps/nvdimm/nvdimm.H index c1d6b334cce..acd34deda38 100644 --- a/src/include/usr/isteps/nvdimm/nvdimm.H +++ b/src/include/usr/isteps/nvdimm/nvdimm.H @@ -29,17 +29,18 @@ namespace NVDIMM { + enum nvdimm_err_status { - NSTD_VAL_NOPRSV = 0x08, // memory valid, contents not preserved (genesis) - NSTD_VAL_NOPRSV_MASK = 0xF7, - NSTD_VAL_PRSV = 0x04, // memory contents preserved - NSTD_VAL_PRSV_MASK = 0xFB, - NSTD_ERR_NOPRSV = 0x02, // memory failed to preserve contents - NSTD_ERR_NOPRSV_MASK = 0xFD, - NSTD_ERR_NOBKUP = 0x01, // memory unable to preserve future content - NSTD_ERR_NOBKUP_MASK = 0xFE, - NSTD_ERR = 0x03, // NSTD_ERR_NOPRSV+NSTD_ERR_NOBKUP + NSTD_VAL_ERASED = 0x08, // Image erased, SCM device contents not persisted + NSTD_VAL_ERASED_MASK = 0xF7, + NSTD_VAL_ERROR = 0x04, // Valid image successfully restored, SCM persisted (despite the name, this is the flag set on restore success) + NSTD_VAL_ERROR_MASK = 0xFB, + NSTD_VAL_SR_FAILED = 0x02, // Save/Restore failed to persist memory contents + NSTD_VAL_SR_FAILED_MASK = 0xFD, + NSTD_VAL_DISARMED = 0x01, // memory unable to preserve future content + NSTD_VAL_DISARMED_MASK = 0xFE, + NSTD_ERR = 0x03, // NSTD_VAL_SR_FAILED+NSTD_VAL_DISARMED }; #ifndef __HOSTBOOT_RUNTIME @@ -54,7 +55,7 @@ enum nvdimm_err_status * @param[in] i_nvdimmList - list of nvdimm targets * **/ -void nvdimm_restore(TARGETING::TargetHandleList &i_nvdimmList); +errlHndl_t nvdimm_restore(TARGETING::TargetHandleList &i_nvdimmList); /** @@ -273,7 +274,8 @@ errlHndl_t notifyNvdimmProtectionChange(TARGETING::Target* i_target, * - Disarms the trigger for draminit * @param i_target nvdimm target */ -void nvdimm_init(TARGETING::Target *i_nvdimm); +errlHndl_t nvdimm_init(TARGETING::Target *i_nvdimm); + } #endif // NVDIMM_EXT_H__ diff --git a/src/include/usr/isteps/nvdimm/nvdimmreasoncodes.H b/src/include/usr/isteps/nvdimm/nvdimmreasoncodes.H index 8ed928a2d00..da69fb86ea1 100644 --- a/src/include/usr/isteps/nvdimm/nvdimmreasoncodes.H +++ 
b/src/include/usr/isteps/nvdimm/nvdimmreasoncodes.H @@ -99,6 +99,10 @@ enum nvdimmModuleId SEND_ATTR_NVDIMM_ARMED = 0x31, NVDIMM_FACTORY_RESET = 0x32, NVDIMM_HEALTH_CHECK = 0x33, + NVDIMM_CHECK_RESETN = 0x34, + NVDIMM_CHECK_CSAVE = 0x35, + NVDIMM_MODULE_HEALTH_STATUS_CHECK = 0x36, + NVDIMM_SET_EVENT_NOTIFICATION = 0x37, }; /** @@ -151,27 +155,48 @@ enum nvdimmReasonCode NVDIMM_UPDATE_NOT_SUPPORTED = NVDIMM_COMP_ID | 0x27, // NV controller cannot be updated NVDIMM_START_UPDATE = NVDIMM_COMP_ID | 0x28, // start update NVDIMM_UPDATE_COMPLETE = NVDIMM_COMP_ID | 0x29, // update completed - NVDIMM_TPM_NOT_FOUND = NVDIMM_COMP_ID | 0x30, // TPM not found - NVDIMM_VERIF_BYTE_CHECK_FAILED = NVDIMM_COMP_ID | 0x31, // Encryption key reg verif failed - NVDIMM_ENCRYPTION_ENABLE_FAILED = NVDIMM_COMP_ID | 0x32, // Encryption enable failed - NVDIMM_ENCRYPTION_ERASE_PENDING_FAILED = NVDIMM_COMP_ID | 0x32, // Encryption crypto erase pending failed - NVDIMM_ENCRYPTION_ERASE_FAILED = NVDIMM_COMP_ID | 0x33, // Encryption crypto erase failed - NVDIMM_ENCRYPTION_UNLOCK_FAILED = NVDIMM_COMP_ID | 0x34, // Encryption unlock failed - NVDIMM_ENCRYPTION_INVALID_ATTRIBUTE = NVDIMM_COMP_ID | 0x35, // Encryption attribute key data invalid - NVDIMM_ENCRYPTION_KEY_ATTRS_INVALID = NVDIMM_COMP_ID | 0x36, // Encryption key attributes are both invalid - NVDIMM_ENCRYPTION_MAX_DARN_ERRORS = NVDIMM_COMP_ID | 0x37, // Darn random key gen reached max errors - NVDIMM_ENCRYPTION_BAD_RANDOM_DATA = NVDIMM_COMP_ID | 0x38, // Generated key data not valid - NVDIMM_CANNOT_MAKE_ATTRIBUTE = NVDIMM_COMP_ID | 0x39, // Cannot make Attribute - NVDIMM_HEALTH_CHECK_IN_PROGRESS_FAILURE = NVDIMM_COMP_ID | 0x3A, // !< pertains to ES_CMD_STATUS0[0]; the health check in progress flag - NVDIMM_HEALTH_CHECK_REPORTED_FAILURE = NVDIMM_COMP_ID | 0x3B, // !< pertains to ES_CMD_STATUS0[2]; the health check reported a failure flag - NVDIMM_LIFETIME_MIN_REQ_NOT_MET = NVDIMM_COMP_ID | 0x3C, // !< pertains to ES_LIFETIME; BPM does not meet 
minimum requirement for a new BPM - NVDIMM_HEALTH_CHECK_NEVER_INITIATED = NVDIMM_COMP_ID | 0x3D, // !< A health check was never initiated at start of IPL + NVDIMM_TPM_NOT_FOUND = NVDIMM_COMP_ID | 0x2A, // TPM not found + NVDIMM_POWER_SAVE_FAILURE = NVDIMM_COMP_ID | 0x2B, // Save failed due to power loss + NVDIMM_CSAVE_ERROR = NVDIMM_COMP_ID | 0x2C, // CSave failed due to error + NVDIMM_VOLTAGE_REGULATOR_FAILED = NVDIMM_COMP_ID | 0x2D, + NVDIMM_VDD_LOST = NVDIMM_COMP_ID | 0x2E, + NVDIMM_VPP_LOST = NVDIMM_COMP_ID | 0x2F, + NVDIMM_VTT_LOST = NVDIMM_COMP_ID | 0x30, + NVDIMM_DRAM_NOT_SELF_REFRESH = NVDIMM_COMP_ID | 0x31, + NVDIMM_CONTROLLER_HARDWARE_ERROR = NVDIMM_COMP_ID | 0x32, + NVDIMM_NVM_CONTROLLER_ERROR = NVDIMM_COMP_ID | 0x33, + NVDIMM_NVM_LIFETIME_ERROR = NVDIMM_COMP_ID | 0x34, + NVDIMM_NOT_ENOUGH_ENERGY_FOR_CSAVE = NVDIMM_COMP_ID | 0x35, + NVDIMM_INVALID_FIRMWARE_ERROR = NVDIMM_COMP_ID | 0x36, // Module Health Status Registers + NVDIMM_CONFIG_DATA_ERROR = NVDIMM_COMP_ID | 0x37, + NVDIMM_NO_ES_PRESENT = NVDIMM_COMP_ID | 0x38, + NVDIMM_ES_POLICY_NOT_SET = NVDIMM_COMP_ID | 0x39, + NVDIMM_ES_HARDWARE_FAILURE = NVDIMM_COMP_ID | 0x3A, + NVDIMM_ES_HEALTH_ASSESSMENT_ERROR = NVDIMM_COMP_ID | 0x3B, + NVDIMM_ES_LIFETIME_ERROR = NVDIMM_COMP_ID | 0x3C, + NVDIMM_ES_TEMP_ERROR = NVDIMM_COMP_ID | 0x3D, + NVDIMM_SET_EVENT_NOTIFICATION_ERROR = NVDIMM_COMP_ID | 0x3E, + NVDIMM_VERIF_BYTE_CHECK_FAILED = NVDIMM_COMP_ID | 0x3F, // Encryption key reg verif failed + NVDIMM_ENCRYPTION_ENABLE_FAILED = NVDIMM_COMP_ID | 0x40, // Encryption enable failed + NVDIMM_ENCRYPTION_ERASE_PENDING_FAILED = NVDIMM_COMP_ID | 0x41, // Encryption crypto erase pending failed + NVDIMM_ENCRYPTION_ERASE_FAILED = NVDIMM_COMP_ID | 0x42, // Encryption crypto erase failed + NVDIMM_ENCRYPTION_UNLOCK_FAILED = NVDIMM_COMP_ID | 0x43, // Encryption unlock failed + NVDIMM_ENCRYPTION_INVALID_ATTRIBUTE = NVDIMM_COMP_ID | 0x44, // Encryption attribute key data invalid + NVDIMM_ENCRYPTION_KEY_ATTRS_INVALID = 
NVDIMM_COMP_ID | 0x45, // Encryption key attributes are both invalid + NVDIMM_ENCRYPTION_MAX_DARN_ERRORS = NVDIMM_COMP_ID | 0x46, // Darn random key gen reached max errors + NVDIMM_ENCRYPTION_BAD_RANDOM_DATA = NVDIMM_COMP_ID | 0x47, // Generated key data not valid + NVDIMM_CANNOT_MAKE_ATTRIBUTE = NVDIMM_COMP_ID | 0x48, // Cannot make Attribute + NVDIMM_HEALTH_CHECK_IN_PROGRESS_FAILURE = NVDIMM_COMP_ID | 0x49, // !< pertains to ES_CMD_STATUS0[0]; the health check in progress flag + NVDIMM_HEALTH_CHECK_REPORTED_FAILURE = NVDIMM_COMP_ID | 0x4A, // !< pertains to ES_CMD_STATUS0[2]; the health check reported a failure flag + NVDIMM_LIFETIME_MIN_REQ_NOT_MET = NVDIMM_COMP_ID | 0x4B, // !< pertains to ES_LIFETIME; BPM does not meet minimum requirement for a new BPM + NVDIMM_HEALTH_CHECK_NEVER_INITIATED = NVDIMM_COMP_ID | 0x4C, // !< A health check was never initiated at start of IPL }; enum UserDetailsTypes { NVDIMM_UDT_NO_FORMAT = 0x0, NVDIMM_UDT_PARAMETERS = 0x1, + NVDIMM_OP_PARAMETERS = 0x2, }; }; // end NVDIMM diff --git a/src/usr/isteps/nvdimm/errlud_nvdimm.C b/src/usr/isteps/nvdimm/errlud_nvdimm.C index 743297b94d7..9f916e04058 100644 --- a/src/usr/isteps/nvdimm/errlud_nvdimm.C +++ b/src/usr/isteps/nvdimm/errlud_nvdimm.C @@ -158,9 +158,46 @@ UdNvdimmParms::UdNvdimmParms( uint8_t i_opType, } //------------------------------------------------------------------------------ -UdNvdimmParms::~UdNvdimmParms() -{ +UdNvdimmParms::~UdNvdimmParms() = default; +//------------------------------------------------------------------------------ +// NVDIMM Dimm Operation Parameters and Errors +//------------------------------------------------------------------------------ +UdNvdimmOPParms::UdNvdimmOPParms( const nvdimm_reg_t &i_RegInfo ) +{ + // Version control for ErrorUD struct + iv_CompId = NVDIMM_COMP_ID; + iv_Version = 3; + iv_SubSection = NVDIMM_OP_PARAMETERS; + + //***** Memory Layout ***** + // 1 byte : MODULE_HEALTH + // 1 byte : MODULE_HEALTH_STATUS0 + // 1 byte : 
MODULE_HEALTH_STATUS1 + // 1 byte : CSAVE_STATUS + // 1 byte : CSAVE_INFO + // 1 byte : CSAVE_FAIL_INFO0 + // 1 byte : CSAVE_FAIL_INFO1 + // 1 byte : ERROR_THRESHOLD_STATUS + // 1 byte : NVDIMM_READY + // 1 byte : NVDIMM_CMD_STATUS0 + // 1 byte : ABORT_CMD_TIMEOUT + // 1 byte : ERASE_STATUS + // 1 byte : ERASE_TIMEOUT0 + // 1 byte : ERASE_TIMEOUT1 + // 1 byte : SET_ES_POLICY_STATUS + // 1 byte : RESTORE_STATUS + // 1 byte : RESTORE_FAIL_INFO + // 1 byte : RESTORE_TIMEOUT0 + // 1 byte : RESTORE_TIMEOUT1 + // 1 byte : ARM_STATUS + // 1 byte : SET_EVENT_NOTIFICATION_STATUS + + char * l_pBuf = reinterpret_cast<char *>( reallocUsrBuf(sizeof(i_RegInfo))); + memcpy(l_pBuf, &i_RegInfo, sizeof(i_RegInfo)); } +// Default the destructor +UdNvdimmOPParms::~UdNvdimmOPParms() = default; + } // end NVDIMM namespace diff --git a/src/usr/isteps/nvdimm/errlud_nvdimm.H b/src/usr/isteps/nvdimm/errlud_nvdimm.H index 55b5f9b20fa..2041da054de 100644 --- a/src/usr/isteps/nvdimm/errlud_nvdimm.H +++ b/src/usr/isteps/nvdimm/errlud_nvdimm.H @@ -61,12 +61,37 @@ class UdNvdimmParms : public ERRORLOG::ErrlUserDetails */ virtual ~UdNvdimmParms(); - private: // Disabled - UdNvdimmParms(UdNvdimmParms &); - UdNvdimmParms & operator=(UdNvdimmParms &); + UdNvdimmParms(UdNvdimmParms &) = delete; + UdNvdimmParms & operator=(UdNvdimmParms &) = delete; }; -} // end NVDIMM namespace +/** + * @class UdNvdimmOPParms + * + * Adds NVDIMM operation register values to an error log as user detail data + */ +class UdNvdimmOPParms : public ERRORLOG::ErrlUserDetails +{ + public: + /** + * @brief Constructor + * + * @param[in] i_RegInfo Register values copied byte-for-byte into the user detail buffer + */ + UdNvdimmOPParms( const nvdimm_reg_t &i_RegInfo ); + + /** + * @brief Destructor + */ + virtual ~UdNvdimmOPParms(); + + // Disabled + UdNvdimmOPParms() = delete; + UdNvdimmOPParms(UdNvdimmOPParms &) = delete; + UdNvdimmOPParms & operator=(UdNvdimmOPParms &) = delete; +}; + +} // end of namespace NVDIMM #endif diff --git a/src/usr/isteps/nvdimm/nvdimm.C 
b/src/usr/isteps/nvdimm/nvdimm.C index db26eb18414..3e0d712ffae 100644 --- a/src/usr/isteps/nvdimm/nvdimm.C +++ b/src/usr/isteps/nvdimm/nvdimm.C @@ -40,6 +40,8 @@ #include #include #include +#include "errlud_nvdimm.H" +#include "nvdimmErrorLog.H" #include #include #include @@ -54,6 +56,7 @@ using namespace TARGETING; using namespace DeviceFW; using namespace EEPROM; +using namespace ERRORLOG; trace_desc_t* g_trac_nvdimm = NULL; TRAC_INIT(&g_trac_nvdimm, NVDIMM_COMP_NAME, 2*KILOBYTE); @@ -360,24 +363,24 @@ void nvdimmSetStatusFlag(Target *i_nvdimm, const uint8_t i_status_flag) switch(i_status_flag) { - // Make sure NSTD_VAL_PRSV (content preserved) is unset before setting NSTD_VAL_NOPRSV - // (data not preserved) or NSTD_ERR_NOPRSV (error preserving data) + // Make sure NSTD_VAL_ERROR (content preserved) is unset before setting NSTD_VAL_ERASED + // (data not preserved) or NSTD_VAL_SR_FAILED (error preserving data) case NSTD_ERR: - case NSTD_VAL_NOPRSV: - case NSTD_ERR_NOPRSV: - l_statusFlag &= NSTD_VAL_PRSV_MASK; + case NSTD_VAL_ERASED: + case NSTD_VAL_SR_FAILED: + l_statusFlag &= NSTD_VAL_ERROR_MASK; l_statusFlag |= i_status_flag; break; // If the content preserved(restore sucessfully), make sure - // NSTD_VAL_NOPRSV (not preserved) and NSTD_ERR_NOPRSV (error preserving) + // NSTD_VAL_ERASED (not preserved) and NSTD_VAL_SR_FAILED (error preserving) // are unset before setting this flag. 
- case NSTD_VAL_PRSV: - l_statusFlag &= (NSTD_VAL_NOPRSV_MASK & NSTD_ERR_NOPRSV_MASK); + case NSTD_VAL_ERROR: + l_statusFlag &= (NSTD_VAL_ERASED_MASK & NSTD_VAL_SR_FAILED_MASK); l_statusFlag |= i_status_flag; break; - case NSTD_ERR_NOBKUP: + case NSTD_VAL_DISARMED: l_statusFlag |= i_status_flag; break; @@ -407,7 +410,9 @@ errlHndl_t nvdimmReady(Target *i_nvdimm) TRACUCOMP(g_trac_nvdimm, ENTER_MRK"nvdimmReady() HUID[%X]",get_huid(i_nvdimm)); errlHndl_t l_err = nullptr; + // Value-initialized, as in the other poll helpers, so registers that are never read are not logged uninitialized + nvdimm_reg_t l_RegInfo = nvdimm_reg_t(); uint8_t l_data = 0x0; uint8_t l_nvm_init_time = 0; size_t l_numBytes = 1; @@ -456,6 +460,51 @@ errlHndl_t nvdimmReady(Target *i_nvdimm) if ((l_data != NV_READY) && !l_err) { + + // Collect available status registers for error log + do + { + // Scratch byte for the FFDC reads; l_data must keep the not-ready + // value that the trace after this block reports + uint8_t l_regData = 0x0; + // Read and save NVDIMM_READY for traces + l_err = nvdimmReadReg(i_nvdimm, NVDIMM_READY, l_regData); + if(l_err) + { + errlCommit( l_err, NVDIMM_COMP_ID ); + break; + } + l_RegInfo.NVDimm_Ready = l_regData; + + // Read and save MODULE_HEALTH for traces + l_err = nvdimmReadReg(i_nvdimm, MODULE_HEALTH, l_regData); + if(l_err) + { + errlCommit( l_err, NVDIMM_COMP_ID ); + break; + } + l_RegInfo.Module_Health = l_regData; + + // Read and save MODULE_HEALTH_STATUS0 for traces + l_err = nvdimmReadReg(i_nvdimm, MODULE_HEALTH_STATUS0, l_regData); + if(l_err) + { + errlCommit( l_err, NVDIMM_COMP_ID ); + break; + } + l_RegInfo.Module_Health_Status0 = l_regData; + + // Read and save MODULE_HEALTH_STATUS1 for traces + l_err = nvdimmReadReg(i_nvdimm, MODULE_HEALTH_STATUS1, l_regData); + if(l_err) + { + errlCommit( l_err, NVDIMM_COMP_ID ); + break; + } + l_RegInfo.Module_Health_Status1 = l_regData; + + }while(0); + TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmReady() nvdimm[%X] - nvdimm not ready[%d]", get_huid(i_nvdimm), l_data); /*@ @@ -484,7 +530,12 @@ errlHndl_t nvdimmReady(Target *i_nvdimm) // a failing indication on the NV controller l_err->addPartCallout( i_nvdimm, HWAS::NV_CONTROLLER_PART_TYPE, - HWAS::SRCI_PRIORITY_HIGH); + HWAS::SRCI_PRIORITY_HIGH, + 
HWAS::DECONFIG, + HWAS::GARD_Fatal); + + // Add Register Traces to error log + NVDIMM::UdNvdimmOPParms( l_RegInfo ).addToLog(l_err); } }while(0); @@ -616,12 +667,6 @@ errlHndl_t nvdimmPollStatus ( Target *i_nvdimm, ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); l_err->collectTrace(NVDIMM_COMP_NAME); - - // May have to move the error handling to the caller - // as different op could have different error severity - l_err->addPartCallout( i_nvdimm, - HWAS::NV_CONTROLLER_PART_TYPE, - HWAS::SRCI_PRIORITY_HIGH); } return l_err; @@ -646,9 +691,39 @@ errlHndl_t nvdimmPollBackupDone(Target* i_nvdimm, get_huid(i_nvdimm)); errlHndl_t l_err = nullptr; + nvdimm_reg_t l_RegInfo = nvdimm_reg_t(); l_err = nvdimmPollStatus ( i_nvdimm, SAVE, o_poll); + if (l_err) + { + errlCommit(l_err, NVDIMM_COMP_ID); + + /*@ + *@errortype + *@reasoncode NVDIMM_BACKUP_TIMEOUT + *@severity ERRORLOG_SEV_PREDICTIVE + *@moduleid NVDIMM_POLL_BACKUP + *@userdata1[0:31] Related ops (0xff = NA) + *@userdata1[32:63] Target Huid + *@devdesc Encountered timeout while performing NVDIMM Backup operation + *@custdesc NVDIMM timed out + */ + l_err = new ERRORLOG::ErrlEntry( ERRORLOG::ERRL_SEV_PREDICTIVE, + NVDIMM_POLL_BACKUP, + NVDIMM_BACKUP_TIMEOUT, + NVDIMM_SET_USER_DATA_1(SAVE, TARGETING::get_huid(i_nvdimm)), + ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); + + l_err->collectTrace( NVDIMM_COMP_NAME ); + + // Collect register data for FFDC Traces + nvdimmTraceRegs ( i_nvdimm, l_RegInfo ); + + // Add reg traces to the error log + NVDIMM::UdNvdimmOPParms( l_RegInfo ).addToLog(l_err); + } + TRACUCOMP(g_trac_nvdimm, EXIT_MRK"nvdimmPollBackupDone() nvdimm[%X]", get_huid(i_nvdimm)); @@ -673,15 +748,52 @@ errlHndl_t nvdimmPollRestoreDone(Target* i_nvdimm, get_huid(i_nvdimm)); errlHndl_t l_err = nullptr; + nvdimm_reg_t l_RegInfo = nvdimm_reg_t(); l_err = nvdimmPollStatus ( i_nvdimm, RESTORE, o_poll ); + if (l_err) + { + errlCommit(l_err, NVDIMM_COMP_ID); + + /*@ + *@errortype + *@reasoncode NVDIMM_RESTORE_TIMEOUT + *@severity 
ERRORLOG_SEV_PREDICTIVE + *@moduleid NVDIMM_POLL_RESTORE + *@userdata1[0:31] Related ops (0xff = NA) + *@userdata1[32:63] Target Huid + *@devdesc Encountered timeout while performing NVDIMM Restore operation + *@custdesc NVDIMM timed out + */ + l_err = new ERRORLOG::ErrlEntry( ERRORLOG::ERRL_SEV_PREDICTIVE, + NVDIMM_POLL_RESTORE, + NVDIMM_RESTORE_TIMEOUT, + NVDIMM_SET_USER_DATA_1(RESTORE, TARGETING::get_huid(i_nvdimm)), + ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); + + l_err->collectTrace( NVDIMM_COMP_NAME ); + + // May have to move the error handling to the caller + // as different op could have different error severity + l_err->addPartCallout( i_nvdimm, + HWAS::NV_CONTROLLER_PART_TYPE, + HWAS::SRCI_PRIORITY_HIGH); + + // Collect register data for FFDC Traces + nvdimmTraceRegs ( i_nvdimm, l_RegInfo ); + + // Add reg traces to the error log + NVDIMM::UdNvdimmOPParms( l_RegInfo ).addToLog(l_err); + } + TRACUCOMP(g_trac_nvdimm, EXIT_MRK"nvdimmPollRestoreDone() nvdimm[%X]", get_huid(i_nvdimm)); return l_err; } + /** * @brief This function polls the command status register for erase * completion (does not indicate success or fail) @@ -701,7 +813,31 @@ errlHndl_t nvdimmPollEraseDone(Target* i_nvdimm, errlHndl_t l_err = nullptr; - l_err = nvdimmPollStatus ( i_nvdimm, ERASE, o_poll); + l_err = nvdimmPollStatus( i_nvdimm, ERASE, o_poll); + + if (l_err) + { + errlCommit(l_err, NVDIMM_COMP_ID); + + /*@ + *@errortype + *@reasoncode NVDIMM_ERASE_TIMEOUT + *@severity ERRORLOG_SEV_PREDICTIVE + *@moduleid NVDIMM_POLL_ERASE + *@userdata1[0:31] Related ops (0xff = NA) + *@userdata1[32:63] Target Huid + *@devdesc Encountered timeout while performing NVDIMM Erase operation + *@custdesc NVDIMM timed out + */ + l_err = new ERRORLOG::ErrlEntry( ERRORLOG::ERRL_SEV_PREDICTIVE, + NVDIMM_POLL_ERASE, + NVDIMM_ERASE_TIMEOUT, + NVDIMM_SET_USER_DATA_1(ERASE, TARGETING::get_huid(i_nvdimm)), + ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); + + l_err->collectTrace( NVDIMM_COMP_NAME ); + + } 
TRACUCOMP(g_trac_nvdimm, EXIT_MRK"nvdimmPollEraseDone() nvdimm[%X]", get_huid(i_nvdimm)); @@ -729,7 +865,16 @@ errlHndl_t nvdimmPollESChargeStatus(Target* i_nvdimm, errlHndl_t l_err = nullptr; - l_err = nvdimmPollStatus ( i_nvdimm, CHARGE, o_poll ); + l_err = nvdimmPollStatus( i_nvdimm, CHARGE, o_poll ); + + // l_err is nullptr when the poll succeeded, so only attach the callout + // to a real error log (the other poll wrappers guard the same way) + if (l_err) + { + l_err->addPartCallout( i_nvdimm, + HWAS::NV_CONTROLLER_PART_TYPE, + HWAS::SRCI_PRIORITY_HIGH); + } TRACUCOMP(g_trac_nvdimm, EXIT_MRK"nvdimmPollESChargeDone() nvdimm[%X]", get_huid(i_nvdimm)); @@ -781,7 +921,8 @@ errlHndl_t nvdimmSetESPolicy(Target* i_nvdimm) get_huid(i_nvdimm)); errlHndl_t l_err = nullptr; - uint8_t l_data; + uint8_t l_data = 0x0; + nvdimm_reg_t l_RegInfo = nvdimm_reg_t(); do { @@ -790,7 +931,7 @@ errlHndl_t nvdimmSetESPolicy(Target* i_nvdimm) if (l_err) { - nvdimmSetStatusFlag(i_nvdimm, NSTD_ERR_NOBKUP); + nvdimmSetStatusFlag(i_nvdimm, NSTD_VAL_DISARMED); TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmSetESPolicy() nvdimm[%X]" "failed to write ES register!",get_huid(i_nvdimm)); break; @@ -804,13 +945,13 @@ errlHndl_t nvdimmSetESPolicy(Target* i_nvdimm) if (l_err) { - nvdimmSetStatusFlag(i_nvdimm, NSTD_ERR_NOBKUP); + nvdimmSetStatusFlag(i_nvdimm, NSTD_VAL_DISARMED); TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmSetESPolicy() nvdimm[%X]" "failed to read ES register!",get_huid(i_nvdimm)); break; } - if ((l_data & ES_SUCCESS) != ES_SUCCESS) + if (((l_data & ES_SUCCESS) != ES_SUCCESS) || ((l_data & ES_POLICY_ERROR) == ES_POLICY_ERROR)) { TRACFCOMP(g_trac_nvdimm, EXIT_MRK"NDVIMM HUID[%X], nvdimmSetESPolicy() " "failed!",get_huid(i_nvdimm)); @@ -837,14 +978,11 @@ errlHndl_t nvdimmSetESPolicy(Target* i_nvdimm) l_err->collectTrace(NVDIMM_COMP_NAME); - // Failure setting the energy source policy could mean error on the - // battery or even the cabling - l_err->addPartCallout( i_nvdimm, - HWAS::BPM_PART_TYPE, - HWAS::SRCI_PRIORITY_HIGH); - l_err->addPartCallout( i_nvdimm, - HWAS::BPM_CABLE_PART_TYPE, - HWAS::SRCI_PRIORITY_HIGH); + // Read relevant regs for trace data + 
nvdimmTraceRegs(i_nvdimm, l_RegInfo); + + // Add reg traces to the error log + NVDIMM::UdNvdimmOPParms( l_RegInfo ).addToLog(l_err); } }while(0); @@ -938,12 +1076,12 @@ errlHndl_t nvdimmValidImage(Target *i_nvdimm, bool &o_imgValid) * @return errlHndl_t - Null if successful, otherwise a pointer to * the error log. */ -errlHndl_t nvdimmRestore(TargetHandleList i_nvdimmList, uint8_t &i_mpipl) +errlHndl_t nvdimmRestore(TargetHandleList& i_nvdimmList, uint8_t &i_mpipl) { errlHndl_t l_err = nullptr; - bool l_imgValid; uint8_t l_rstrValid; uint32_t l_poll = 0; + TargetHandleList l_nvdimmList = i_nvdimmList; do { @@ -952,23 +1090,7 @@ errlHndl_t nvdimmRestore(TargetHandleList i_nvdimmList, uint8_t &i_mpipl) it != i_nvdimmList.end();) { // Default state during boot is unarmed, therefore not preserved - nvdimmSetStatusFlag(*it, NSTD_ERR_NOBKUP); - - l_err = nvdimmValidImage(*it, l_imgValid); - - // No reason to run if we can't figure out - // if there is an image or not - if (l_err) - { - break; - } - - if (!l_imgValid) - { - nvdimmSetStatusFlag(*it, NSTD_VAL_NOPRSV); - i_nvdimmList.erase(it); - continue; - } + nvdimmSetStatusFlag(*it, NSTD_VAL_DISARMED); TargetHandleList l_mcaList; getParentAffinityTargets(l_mcaList, *it, CLASS_UNIT, TYPE_MCA); @@ -987,13 +1109,6 @@ errlHndl_t nvdimmRestore(TargetHandleList i_nvdimmList, uint8_t &i_mpipl) { TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmRestore() HUID[%X] i_mpipl[%u] failed to de-assert resetn!", get_huid(*it), i_mpipl); - - nvdimmSetStatusFlag(*it, NSTD_ERR_NOPRSV); - //@TODO RTC 199645 - add HW callout on dimm target - // If we failed to de-assert reset_n, the dimm is pretty much useless. 
- // Let's not restore if that happens - // The callout will be added inside the HWP - // Leaving this comment here as a reminder, will remove later break; } @@ -1009,7 +1124,7 @@ errlHndl_t nvdimmRestore(TargetHandleList i_nvdimmList, uint8_t &i_mpipl) get_huid(*it), i_mpipl); l_err->setSev(ERRORLOG::ERRL_SEV_INFORMATIONAL); - l_err->collectTrace(NVDIMM_COMP_NAME, 256); + l_err->collectTrace( NVDIMM_COMP_NAME ); ERRORLOG::errlCommit(l_err, NVDIMM_COMP_ID); } @@ -1022,12 +1137,6 @@ errlHndl_t nvdimmRestore(TargetHandleList i_nvdimmList, uint8_t &i_mpipl) { TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmRestore() HUID[%X] self_refresh_entry failed!", get_huid(*it)); - - nvdimmSetStatusFlag(*it, NSTD_ERR_NOPRSV); - //@TODO RTC 199645 - add HW callout on dimm target - // Without SRE the data could be not reliably restored - // The callout will be added inside the HWP - // Leaving this comment here as a reminder, will remove later break; } it++; @@ -1050,7 +1159,6 @@ errlHndl_t nvdimmRestore(TargetHandleList i_nvdimmList, uint8_t &i_mpipl) l_err = nvdimmWriteReg(l_nvdimm, NVDIMM_FUNC_CMD, RESTORE_IMAGE); if (l_err) { - nvdimmSetStatusFlag(l_nvdimm, NSTD_ERR_NOPRSV); TRACFCOMP(g_trac_nvdimm, ERR_MRK"NDVIMM HUID[%X], error initiating restore!!", get_huid(l_nvdimm)); break; @@ -1071,10 +1179,8 @@ errlHndl_t nvdimmRestore(TargetHandleList i_nvdimmList, uint8_t &i_mpipl) l_err = nvdimmPollRestoreDone(l_nvdimm, l_poll); if (l_err) { - nvdimmSetStatusFlag(l_nvdimm, NSTD_ERR_NOPRSV); TRACFCOMP(g_trac_nvdimm, ERR_MRK"NDVIMM HUID[%X], error restoring!", get_huid(l_nvdimm)); - errlCommit(l_err, NVDIMM_COMP_ID); break; } } @@ -1084,22 +1190,22 @@ errlHndl_t nvdimmRestore(TargetHandleList i_nvdimmList, uint8_t &i_mpipl) break; } - // Make sure the restore is valid + // Check for restore errors for (const auto & l_nvdimm : i_nvdimmList) { l_err = nvdimmGetRestoreValid(l_nvdimm, l_rstrValid); if (l_err) { - nvdimmSetStatusFlag(l_nvdimm, NSTD_ERR_NOPRSV); TRACFCOMP(g_trac_nvdimm, 
ERR_MRK"nvdimmRestore Target[%X] error validating restore status!", get_huid(l_nvdimm)); break; } - if ((l_rstrValid & RSTR_SUCCESS) != RSTR_SUCCESS){ + if ((l_rstrValid & RSTR_ERROR) == RSTR_ERROR) + { - TRACFCOMP(g_trac_nvdimm, ERR_MRK"NDVIMM HUID[%X] restoreValid[%d], restore failed!", - get_huid(l_nvdimm), l_rstrValid); + TRACFCOMP(g_trac_nvdimm, ERR_MRK"NDVIMM HUID[%X] restore failed due to errors", + get_huid(l_nvdimm)); /*@ *@errortype *@reasoncode NVDIMM_RESTORE_FAILED @@ -1119,28 +1225,21 @@ errlHndl_t nvdimmRestore(TargetHandleList i_nvdimmList, uint8_t &i_mpipl) get_huid(l_nvdimm), 0x0, ERRORLOG::ErrlEntry::NO_SW_CALLOUT); - - l_err->collectTrace(NVDIMM_COMP_NAME); - nvdimmSetStatusFlag(l_nvdimm, NSTD_ERR_NOPRSV); - - // Invalid restore could be due to dram not in self-refresh - // or controller issue. Data should not be trusted at this point - l_err->addPartCallout( l_nvdimm, - HWAS::NV_CONTROLLER_PART_TYPE, - HWAS::SRCI_PRIORITY_HIGH); break; } } if (l_err) { + TRACFCOMP(g_trac_nvdimm, "restore encountered an error"); break; } // Exit self-refresh - for (const auto & l_nvdimm : i_nvdimmList) + // Iterate the snapshot taken at function entry: the success path below erases + // from i_nvdimmList, which would invalidate a range-for over that same list + for (const auto & l_nvdimm : l_nvdimmList) { - TargetHandleList l_mcaList; getParentAffinityTargets(l_mcaList, l_nvdimm, CLASS_UNIT, TYPE_MCA); assert(l_mcaList.size(), "nvdimmRestore() failed to find parent MCA."); @@ -1155,21 +1252,25 @@ errlHndl_t nvdimmRestore(TargetHandleList i_nvdimmList, uint8_t &i_mpipl) { TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmRestore() HUID[%X] post_restore_transition failed!", get_huid(l_nvdimm)); - - // Commit the error from the HWP - nvdimmSetStatusFlag(l_nvdimm, NSTD_ERR_NOPRSV); break; } else { // Restore success! 
- nvdimmSetStatusFlag(l_nvdimm, NSTD_VAL_PRSV); + // Remove dimm from list for error handling + i_nvdimmList.erase(i_nvdimmList.begin()); } } + if (l_err) + { + TRACFCOMP(g_trac_nvdimm, "nvdimmRestore() encountered an error during restore"); + break; + } + if (i_mpipl) { - for (const auto & l_nvdimm : i_nvdimmList) + for (const auto & l_nvdimm : l_nvdimmList) { TargetHandleList l_mcaList; errlHndl_t err = nullptr; @@ -1188,7 +1289,7 @@ errlHndl_t nvdimmRestore(TargetHandleList i_nvdimmList, uint8_t &i_mpipl) get_huid(l_nvdimm), i_mpipl); err->setSev(ERRORLOG::ERRL_SEV_INFORMATIONAL); - err->collectTrace(NVDIMM_COMP_NAME, 256); + err->collectTrace( NVDIMM_COMP_NAME ); ERRORLOG::errlCommit(err, NVDIMM_COMP_ID); } } @@ -1203,68 +1304,42 @@ errlHndl_t nvdimmRestore(TargetHandleList i_nvdimmList, uint8_t &i_mpipl) #endif /** - * @brief This function checks the erase status register to make sure - * the last erase completed witout error + * @brief This function polls for erase completion and, on error, adds the NV controller callout and register FFDC to the log * * @param[in] i_nvdimm - nvdimm target with NV controller * * @return errlHndl_t - Null if successful, otherwise a pointer to * the error log. */ -errlHndl_t nvdimmCheckEraseSuccess(Target *i_nvdimm) +errlHndl_t nvdimmEraseCheck(Target *i_nvdimm) { - TRACUCOMP(g_trac_nvdimm, ENTER_MRK"nvdimmCheckEraseSuccess() : nvdimm[%X]", - get_huid(i_nvdimm)); - - uint8_t l_data = 0; errlHndl_t l_err = nullptr; + nvdimm_reg_t l_RegInfo = nvdimm_reg_t(); - l_err = nvdimmReadReg(i_nvdimm, ERASE_STATUS, l_data); + // Erase happens one module at a time. 
No need to set any offset on the counter + uint32_t l_poll = 0; + l_err = nvdimmPollEraseDone(i_nvdimm, l_poll); + // Add part callout, currently all erase calls have same callout + // Dump traces to the error log if error exists if (l_err) { - TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmCheckEraseSuccess() nvdimm[%X]" - "failed to read erase status reg!",get_huid(i_nvdimm)); - } - else if ((l_data & ERASE_SUCCESS) != ERASE_SUCCESS) - { + // For both Erase timeout and Erase fail + // Callout nvdimm on high, gard and deconfig + l_err->addPartCallout( i_nvdimm, + HWAS::NV_CONTROLLER_PART_TYPE, + HWAS::SRCI_PRIORITY_HIGH, + HWAS::DECONFIG, + HWAS::GARD_Fatal); - TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmCheckEraseSuccess() nvdimm[%X]" - "failed to erase!",get_huid(i_nvdimm)); - /*@ - *@errortype - *@reasoncode NVDIMM_ERASE_FAILED - *@severity ERRORLOG_SEV_PREDICTIVE - *@moduleid NVDIMM_CHECK_ERASE - *@userdata1[0:31] Related ops (0xff = NA) - *@userdata1[32:63] Target Huid - *@userdata2 - *@devdesc Encountered error erasing previously stored data image - * on NVDIMM. Likely due to timeout and/or controller error - *@custdesc NVDIMM error erasing data image - */ - l_err = new ERRORLOG::ErrlEntry( - ERRORLOG::ERRL_SEV_PREDICTIVE, - NVDIMM_CHECK_ERASE, - NVDIMM_ERASE_FAILED, - NVDIMM_SET_USER_DATA_1(ERASE, get_huid(i_nvdimm)), - 0x0, - ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); - l_err->collectTrace(NVDIMM_COMP_NAME); - errlCommit( l_err, NVDIMM_COMP_ID ); + // Collect register data for FFDC Traces + nvdimmTraceRegs ( i_nvdimm, l_RegInfo ); - // Failure to erase could mean internal NV controller error and/or - // HW error on nand flash. 
NVDIMM will lose persistency if failed to - // erase nand flash - l_err->addPartCallout( i_nvdimm, - HWAS::NV_CONTROLLER_PART_TYPE, - HWAS::SRCI_PRIORITY_HIGH); + // Add reg traces to the error log + NVDIMM::UdNvdimmOPParms( l_RegInfo ).addToLog(l_err); } - TRACUCOMP(g_trac_nvdimm, EXIT_MRK"nvdimmCheckEraseSuccess(): nvdimm[%X] ret[%X]", - get_huid(i_nvdimm), l_data); - return l_err; } @@ -1293,13 +1368,8 @@ errlHndl_t nvdimmEraseNF(Target *i_nvdimm) break; } - // Erase happens one module at a time. No need to set any offset on the counter - uint32_t l_poll = 0; - l_err = nvdimmPollEraseDone(i_nvdimm, l_poll); - if (!l_err) - { - l_err = nvdimmCheckEraseSuccess(i_nvdimm); - } + // Poll for success and check status + l_err = nvdimmEraseCheck(i_nvdimm); }while(0); @@ -1525,7 +1595,7 @@ errlHndl_t nvdimmEpowSetup(TargetHandleList &i_nvdimmList) TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmEpowSetup() HUID[%X] failed to setup epow!", get_huid(*it)); - nvdimmSetStatusFlag(*it, NSTD_ERR_NOPRSV); + nvdimmSetStatusFlag(*it, NSTD_VAL_SR_FAILED); break; } it++; @@ -1547,32 +1617,46 @@ errlHndl_t nvdimmEpowSetup(TargetHandleList &i_nvdimmList) * @param[in] i_nvdimmList - list of nvdimm targets * */ -void nvdimm_restore(TargetHandleList &i_nvdimmList) +errlHndl_t nvdimm_restore(TargetHandleList &i_nvdimmList) { TRACUCOMP(g_trac_nvdimm, ENTER_MRK"nvdimm_restore()"); + errlHndl_t l_err = nullptr; - Target* l_sys = nullptr; - targetService().getTopLevelTarget( l_sys ); + bool l_valid = false; + bool l_exit = false; + TARGETING::Target* l_sys = nullptr; + TARGETING::targetService().getTopLevelTarget( l_sys ); assert(l_sys, "nvdimm_restore: no TopLevelTarget"); uint8_t l_mpipl = l_sys->getAttr(); + nvdimm_reg_t l_RegInfo = nvdimm_reg_t(); + TargetHandleList l_nvdimmList = i_nvdimmList; + uint8_t l_rstrValid; do { - // Set the energy policy to device-managed - // Don't think this is needed for the supercaps to start charging - // but do it anyway to get the charging going for (const auto 
& l_nvdimm : i_nvdimmList) { - l_err = nvdimmSetESPolicy(l_nvdimm); + // Check for a valid image + l_err = nvdimmValidImage( l_nvdimm, l_valid ); if (l_err) { - // Failing this is an indication of power pack issue. - // This will prevent future backup, but let's continue - // since we can still restore the data if there is any - nvdimmSetStatusFlag(l_nvdimm, NSTD_ERR_NOBKUP); - TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimm_restore() - Failing nvdimmSetESPolicy()"); - errlCommit( l_err, NVDIMM_COMP_ID ); + TRACFCOMP(g_trac_nvdimm, "nvdimmRestore() nvdimm[%X] restore failed to read the image", get_huid(l_nvdimm)); + errlCommit(l_err, NVDIMM_COMP_ID); } + + if (!l_valid) + { + TRACFCOMP(g_trac_nvdimm, "nvdimmRestore() nvdimm[%X] restore failed due to invalid image", get_huid(l_nvdimm)); + // Set ATTR NV STATUS FLAG to Erased + nvdimmSetStatusFlag(l_nvdimm, NSTD_VAL_ERASED); + break; + } + + } + + if (!l_valid) + { + break; } if (l_mpipl) @@ -1586,7 +1670,7 @@ void nvdimm_restore(TargetHandleList &i_nvdimmList) if (l_err) { - nvdimmSetStatusFlag(l_nvdimm, NSTD_ERR_NOPRSV); + nvdimmSetStatusFlag(l_nvdimm, NSTD_VAL_ERASED); TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimm_restore() nvdimm[%X], error backing up the DRAM!", get_huid(l_nvdimm)); errlCommit(l_err, NVDIMM_COMP_ID); @@ -1596,31 +1680,77 @@ void nvdimm_restore(TargetHandleList &i_nvdimmList) } // Start the restore - l_err = nvdimmRestore(i_nvdimmList, l_mpipl); + l_err = nvdimmRestore(l_nvdimmList, l_mpipl); + // Check if restore completed successfully if (l_err) { - TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimm_restore() - Failing nvdimmRestore()"); - errlCommit( l_err, NVDIMM_COMP_ID ); + const auto l_nvdimm = l_nvdimmList.front(); + + TRACFCOMP(g_trac_nvdimm, "nvdimm_restore() - Failing nvdimmRestore()"); + nvdimmSetStatusFlag(l_nvdimm, NSTD_VAL_SR_FAILED); + + // Invalid restore could be due to dram not in self-refresh + // or controller issue. 
Data should not be trusted at this point + l_err->addPartCallout( l_nvdimm, + HWAS::NV_CONTROLLER_PART_TYPE, + HWAS::SRCI_PRIORITY_HIGH, + HWAS::DECONFIG, + HWAS::GARD_Fatal); + + // Collect register data for FFDC Traces + nvdimmTraceRegs ( l_nvdimm, l_RegInfo ); + + // Add reg traces to the error log + NVDIMM::UdNvdimmOPParms( l_RegInfo ).addToLog(l_err); break; } - // Make sure the energy source is fully charged before erasing the images - // Doing this on all the nvdimms since the ones w/o image will need - // to be fully charged before arming the trigger - uint32_t l_poll = 0; + // Check health status registers and exit if required for (const auto & l_nvdimm : i_nvdimmList) { - l_err = nvdimmPollESChargeStatus(l_nvdimm, l_poll); + l_err = nvdimmHealthStatusCheck( l_nvdimm, HEALTH_RESTORE, l_exit ); - if (l_err){ - nvdimmSetStatusFlag(l_nvdimm, NSTD_ERR_NOBKUP); - errlCommit( l_err, NVDIMM_COMP_ID ); + if (l_err) + { + TRACFCOMP(g_trac_nvdimm, "nvdimmRestore() nvdimm[%X] failed during health status check", get_huid(l_nvdimm)); + if (l_exit) + { + errlCommit( l_err, NVDIMM_COMP_ID ); + } + else + { + // Redundant check with external err bugged + errlCommit( l_err, NVDIMM_COMP_ID ); + return l_err; + } } + + // Make sure the restore is valid + l_err = nvdimmGetRestoreValid(l_nvdimm, l_rstrValid); + if (l_err) + { + TRACFCOMP(g_trac_nvdimm, "nvdimmRestore Target[%X] error validating restore status!", + get_huid(l_nvdimm)); + break; + } + + if ((l_rstrValid & RSTR_SUCCESS) == RSTR_SUCCESS) + { + // Restore success! + nvdimmSetStatusFlag(l_nvdimm, NSTD_VAL_ERROR); + } + } }while(0); + // Return err not being handled, temp commit: + if (l_err) + { + errlCommit(l_err, NVDIMM_COMP_ID); + } + // At the end, pre-load CCS with commands for EPOW. This will stage the CCS // with the require commands to trigger the save on NVDIMMs. The actual // triggering will be done by OCC when EPOW is detected. 
@@ -1633,6 +1763,7 @@ void nvdimm_restore(TargetHandleList &i_nvdimmList) } TRACUCOMP(g_trac_nvdimm, EXIT_MRK"nvdimm_restore()"); + return l_err; } /** @@ -1733,12 +1864,16 @@ errlHndl_t nvdimm_factory_reset(Target *i_nvdimm) * @param[in] i_nvdimm - nvdimm target * */ -void nvdimm_init(Target *i_nvdimm) +errlHndl_t nvdimm_init(Target *i_nvdimm) { TRACUCOMP(g_trac_nvdimm, ENTER_MRK"nvdimm_init() nvdimm[%X]", get_huid(i_nvdimm)); errlHndl_t l_err = nullptr; + bool l_continue = true; + uint8_t l_data = 0; + nvdimm_reg_t l_RegInfo; + uint32_t l_poll = 0; do { @@ -1759,6 +1894,15 @@ void nvdimm_init(Target *i_nvdimm) } } + // Set ATTR_NV_STATUS_FLAG to default disarmed state + l_err = notifyNvdimmProtectionChange(i_nvdimm, NVDIMM_DISARMED); + if (l_err) + { + nvdimmSetStatusFlag(i_nvdimm, NSTD_ERR); + errlCommit(l_err, NVDIMM_COMP_ID); + } + + // Check if the nvdimm ready status l_err = nvdimmReady(i_nvdimm); if (l_err) @@ -1766,7 +1910,6 @@ void nvdimm_init(Target *i_nvdimm) nvdimmSetStatusFlag(i_nvdimm, NSTD_ERR); TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimm_init() nvdimm[%X], controller not ready", get_huid(i_nvdimm)); - errlCommit(l_err, NVDIMM_COMP_ID); break; } @@ -1777,46 +1920,163 @@ void nvdimm_init(Target *i_nvdimm) nvdimmSetStatusFlag(i_nvdimm, NSTD_ERR); TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimm_init() nvdimm[%X], error retrieving timeout values", get_huid(i_nvdimm)); - errlCommit(l_err, NVDIMM_COMP_ID); break; } - //Check save progress - uint32_t l_poll = 0; - l_err = nvdimmPollBackupDone(i_nvdimm, l_poll); + // Check for Erase in progress and its status + l_err = nvdimmEraseCheck(i_nvdimm); + if (l_err) + { + break; + } + // Check NO_RESET_N bit for power loss without save + l_err = nvdimmReadReg ( i_nvdimm, CSAVE_FAIL_INFO1, l_data); if (l_err) { - nvdimmSetStatusFlag(i_nvdimm, NSTD_ERR_NOPRSV); - TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimm_init() nvdimm[%X], error backing up the DRAM!", - get_huid(i_nvdimm)); - errlCommit(l_err, NVDIMM_COMP_ID); break; } + else if 
((l_data & NO_RESET_N) == NO_RESET_N) + { + // Set ATTR_NV_STATUS_FLAG to restored, as data may persist + nvdimmSetStatusFlag(i_nvdimm, NSTD_VAL_ERROR); + TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmInit() nvdimm[%X]" + "failed to save due to power loss!",get_huid(i_nvdimm)); + /*@ + *@errortype + *@reasoncode NVDIMM_POWER_SAVE_FAILURE + *@severity ERRORLOG_SEV_PREDICTIVE + *@moduleid NVDIMM_CHECK_RESETN + *@userdata1[0:31] Related ops (0xff = NA) + *@userdata1[32:63] Target Huid + *@userdata2 + *@devdesc Encountered error erasing previously stored data image + * on NVDIMM. Likely due to timeout and/or controller error + *@custdesc NVDIMM error erasing data image + */ + l_err = new ERRORLOG::ErrlEntry( ERRORLOG::ERRL_SEV_PREDICTIVE, + NVDIMM_CHECK_RESETN, + NVDIMM_POWER_SAVE_FAILURE, + NVDIMM_SET_USER_DATA_1(l_data, get_huid(i_nvdimm)), + 0x0, + ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); + + l_err->collectTrace( NVDIMM_COMP_NAME ); + + // Failure to erase could mean internal NV controller error and/or + // HW error on nand flash. NVDIMM will lose persistency if failed to + // erase nand flash + l_err->addPartCallout( i_nvdimm, + HWAS::NV_CONTROLLER_PART_TYPE, + HWAS::SRCI_PRIORITY_LOW); - // Unlock encryption if enabled - TargetHandleList l_nvdimmTargetList; - l_nvdimmTargetList.push_back(i_nvdimm); - NVDIMM::nvdimm_encrypt_unlock(l_nvdimmTargetList); + // Collect register data for FFDC Traces + nvdimmTraceRegs ( i_nvdimm, l_RegInfo ); - // Disarm the ddr_resetn here in case it came in armed. When the nvdimm is - // armed the reset_n is masked off from the host, meaning the drams won't - // be able to get reset properly later, causing training to fail. 
- l_err = nvdimmChangeArmState(i_nvdimm, DISARM_TRIGGER); + // Add reg traces to the error log + NVDIMM::UdNvdimmOPParms( l_RegInfo ).addToLog(l_err); + errlCommit(l_err, NVDIMM_COMP_ID); + } + else + { + // Check save progress + l_err = nvdimmPollBackupDone(i_nvdimm, l_poll); + if (l_err) + { + // May have to move the error handling to the caller + // as different op could have different error severity + l_err->addPartCallout( i_nvdimm, + HWAS::NV_CONTROLLER_PART_TYPE, + HWAS::SRCI_PRIORITY_HIGH, + HWAS::DECONFIG, + HWAS::GARD_Fatal); + + TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimm_int() nvdimm[%X], error backing up the DRAM!", + get_huid(i_nvdimm)); + break; + } + } + + // Check CSAVE_ERROR Register + l_err = nvdimmReadReg( i_nvdimm, CSAVE_FAIL_INFO0, l_data ); if (l_err) { - nvdimmSetStatusFlag(i_nvdimm, NSTD_ERR_NOPRSV); - TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimm_init() nvdimm[%X], error disarming the nvdimm!", - get_huid(i_nvdimm)); - errlCommit(l_err, NVDIMM_COMP_ID); + break; + } + else if (l_data != ZERO) + { + /*@ + *@errortype + *@reasoncode NVDIMM_CSAVE_ERROR + *@severity ERRORLOG_SEV_PREDICTIVE + *@moduleid NVDIMM_CHECK_CSAVE + *@userdata1[0:31] Related ops (0xff = NA) + *@userdata1[32:63] Target Huid + *@userdata2 + *@devdesc Encountered error saving during catastrophic save + * on NVDIMM. 
Check error register trace for details + *@custdesc NVDIMM error during Catastrophic Save + */ + l_err = new ERRORLOG::ErrlEntry( ERRORLOG::ERRL_SEV_PREDICTIVE, + NVDIMM_CHECK_CSAVE, + NVDIMM_CSAVE_ERROR, + NVDIMM_SET_USER_DATA_1(l_data, get_huid(i_nvdimm)), + 0x0, + ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); + + l_err->collectTrace( NVDIMM_COMP_NAME ); + + // Collect register data for FFDC Traces + nvdimmTraceRegs ( i_nvdimm, l_RegInfo ); + + // Add reg traces to the error log + NVDIMM::UdNvdimmOPParms( l_RegInfo ).addToLog(l_err); + + // Check if the image is still valid + if ( l_RegInfo.CSave_Info != VALID_IMAGE ) + { + // Callout and gard dimm if image is not valid + l_err->addPartCallout( i_nvdimm, + HWAS::NV_CONTROLLER_PART_TYPE, + HWAS::SRCI_PRIORITY_HIGH, + HWAS::DECONFIG, + HWAS::GARD_Fatal); + } + else + { + // Set ATTR_NV_STATUS_FLAG to Restored as data might persist + nvdimmSetStatusFlag(i_nvdimm, NSTD_VAL_ERROR); + errlCommit(l_err, NVDIMM_COMP_ID); + } + break; + } + + // Check Health Status Registers + l_err = nvdimmHealthStatusCheck(i_nvdimm, HEALTH_SAVE, l_continue); + if(!l_continue) + { break; } + // Unlock encryption if enabled + TargetHandleList l_nvdimmTargetList; + l_nvdimmTargetList.push_back(i_nvdimm); + NVDIMM::nvdimm_encrypt_unlock(l_nvdimmTargetList); + }while(0); TRACUCOMP(g_trac_nvdimm, EXIT_MRK"nvdimm_init() nvdimm[%X]", get_huid(i_nvdimm)); + + // Return err not being handled, temp commit: + if (l_err) + { + errlCommit(l_err, NVDIMM_COMP_ID); + } + + + return l_err; } diff --git a/src/usr/isteps/nvdimm/nvdimm.H b/src/usr/isteps/nvdimm/nvdimm.H index 8304486f620..af77866ffa1 100644 --- a/src/usr/isteps/nvdimm/nvdimm.H +++ b/src/usr/isteps/nvdimm/nvdimm.H @@ -343,12 +343,17 @@ enum i2c_out_values : uint8_t CHARGE_IN_PROGRESS = 0x01, SAVE_SUCCESS = 0x01, RSTR_SUCCESS = 0X01, - ARM_SUCCESS = 0X09, + ARM_SUCCESS = 0X01, ERASE_SUCCESS = 0X01, ES_SUCCESS = 0x05, CHARGE_SUCCESS = 0x00, NV_READY = 0xA5, FACTORY_RESET_IN_PROGRESS = 0x03, + 
NO_RESET_N = 0x20, + RESET_N_ARMED = 0x08, + ES_POLICY_ERROR = 0x02, + ARM_ERROR = 0X02, + RSTR_ERROR = 0x02, }; // Timeout-related enum @@ -422,6 +427,49 @@ union scap_status_union typedef scap_status_union scap_status_register_t; +// Bits in Health Status Check Registers +enum health_status : uint8_t +{ + // Module Health Status0 + VOLTAGE_REGULATOR_FAILED = 0x01, + VDD_LOST = 0x02, + VPP_LOST = 0x04, + VTT_LOST = 0x08, + DRAM_NOT_SELF_REFRESH = 0x10, + CONTROLLER_HARDWARE_ERROR = 0x20, + NVM_CONTROLLER_ERROR = 0x40, + NVM_LIFETIME_ERROR = 0x80, + // Module Health Status1 + NOT_ENOUGH_ENERGY_FOR_CSAVE = 0x01, + INVALID_FIRMWARE_ERROR = 0x02, + CONFIG_DATA_ERROR = 0x04, + NO_ES_PRESENT = 0x08, + ES_POLICY_NOT_SET = 0x10, + ES_HARDWARE_FAILURE = 0x20, + ES_HEALTH_ASSESSMENT_ERROR = 0x40, + // Error Threshold Status + ES_LIFETIME_ERROR = 0x02, + ES_TEMP_ERROR = 0x04, +}; + +// Int representation for health status function call +enum health_function : uint8_t +{ + HEALTH_SAVE = 0x01, + HEALTH_RESTORE = 0x02, + HEALTH_UPDATE = 0x03, + HEALTH_PRE_ARM = 0x04, + HEALTH_POST_ARM = 0x05, +}; + +// Event notification register values +enum event_n : uint8_t +{ + PERSISTENCY_NOTIFICATION = 0x01, + SET_EVENT_NOTIFICATION_ERROR = 0x02, + PERSISTENCY_ENABLED = 0x04, +}; + /** * @brief Wrapper to call deviceOp to read the NV controller via I2C * @@ -504,7 +552,6 @@ errlHndl_t nvdimmPollStatus(TARGETING::Target *i_nvdimm, ops_id i_ops_id, uint32 */ errlHndl_t nvdimmSetESPolicy(TARGETING::Target* i_nvdimm); - /** * @brief Helper function to handle conflicting attribute keys * @@ -631,6 +678,18 @@ errlHndl_t nvdimm_getTPM(TARGETING::Target*& o_tpm); #endif +/** + * @brief This function checks for valid image on the given target + * + * @param[in] i_nvdimm - nvdimm target with NV controller + * + * @param[out] o_imgValid - return true if the target has a valid image + * + * @return errlHndl_t - Null if successful, otherwise a pointer to + * the error log. 
+ */ +errlHndl_t nvdimmValidImage(TARGETING::Target *i_nvdimm, bool &o_imgValid); + } //End NVDIMM namespace diff --git a/src/usr/isteps/nvdimm/nvdimm.mk b/src/usr/isteps/nvdimm/nvdimm.mk index f26c8232b87..d9418b4144b 100644 --- a/src/usr/isteps/nvdimm/nvdimm.mk +++ b/src/usr/isteps/nvdimm/nvdimm.mk @@ -47,6 +47,7 @@ EXTRAINCDIR += ${PROCEDURE_PATH}/hwp/ffdc/ OBJS += nvdimm.o OBJS += nvdimmdd.o OBJS += errlud_nvdimm.o +OBJS += nvdimmErrorLog.o ifneq (${HOSTBOOT_RUNTIME},1) diff --git a/src/usr/isteps/nvdimm/nvdimmErrorLog.C b/src/usr/isteps/nvdimm/nvdimmErrorLog.C new file mode 100644 index 00000000000..57984bb9734 --- /dev/null +++ b/src/usr/isteps/nvdimm/nvdimmErrorLog.C @@ -0,0 +1,1313 @@ +/* IBM_PROLOG_BEGIN_TAG */ +/* This is an automatically generated prolog. */ +/* */ +/* $Source: src/usr/isteps/nvdimm/nvdimmErrorLog.C $ */ +/* */ +/* OpenPOWER HostBoot Project */ +/* */ +/* Contributors Listed Below - COPYRIGHT 2014,2019 */ +/* [+] International Business Machines Corp. */ +/* */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or */ +/* implied. See the License for the specific language governing */ +/* permissions and limitations under the License. 
*/
+/*                                                                        */
+/* IBM_PROLOG_END_TAG                                                     */
+
+#include "nvdimm.H"
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "errlud_nvdimm.H"
+
+using namespace TARGETING;
+
+namespace NVDIMM
+{
+
+/**
+ * @brief Read one NV controller register for FFDC collection
+ *
+ * FFDC collection is best-effort: a failed read is committed here and the
+ * register is reported as 0x00, so that a value left over from a previous
+ * (different) register is never recorded in its place.
+ *
+ * @tparam RegT - register identifier type accepted by nvdimmReadReg
+ *
+ * @param[in] i_nvdimm - nvdimm target
+ *
+ * @param[in] i_reg - register to read
+ *
+ * @return uint8_t - register contents, or 0x00 if the read failed
+ */
+template <typename RegT>
+static uint8_t nvdimmReadTraceReg(Target *i_nvdimm, const RegT i_reg)
+{
+    uint8_t l_data = 0x0;
+    errlHndl_t l_err = nvdimmReadReg(i_nvdimm, i_reg, l_data);
+    if(l_err)
+    {
+        errlCommit( l_err, NVDIMM_COMP_ID );
+
+        // Do not trust whatever the failed access left behind
+        l_data = 0x0;
+    }
+    return l_data;
+}
+
+/**
+ * @brief Read and save various status registers needed for error log traces
+ *
+ * Registers are read in the order listed below. A register that cannot be
+ * read is logged (error committed) and stored as 0x00.
+ *
+ * @param[in] i_nvdimm - nvdimm target
+ *
+ * @param[out] o_RegInfo - struct to hold register data
+ *
+ */
+void nvdimmTraceRegs(Target *i_nvdimm, nvdimm_reg_t& o_RegInfo)
+{
+    // Module health registers
+    o_RegInfo.Module_Health =
+        nvdimmReadTraceReg(i_nvdimm, MODULE_HEALTH);
+    o_RegInfo.Module_Health_Status0 =
+        nvdimmReadTraceReg(i_nvdimm, MODULE_HEALTH_STATUS0);
+    o_RegInfo.Module_Health_Status1 =
+        nvdimmReadTraceReg(i_nvdimm, MODULE_HEALTH_STATUS1);
+
+    // Catastrophic save registers
+    o_RegInfo.CSave_Status =
+        nvdimmReadTraceReg(i_nvdimm, CSAVE_STATUS);
+    o_RegInfo.CSave_Info =
+        nvdimmReadTraceReg(i_nvdimm, CSAVE_INFO);
+    o_RegInfo.CSave_Fail_Info0 =
+        nvdimmReadTraceReg(i_nvdimm, CSAVE_FAIL_INFO0);
+    o_RegInfo.CSave_Fail_Info1 =
+        nvdimmReadTraceReg(i_nvdimm, CSAVE_FAIL_INFO1);
+    o_RegInfo.CSave_Timeout0 =
+        nvdimmReadTraceReg(i_nvdimm, CSAVE_TIMEOUT0);
+    o_RegInfo.CSave_Timeout1 =
+        nvdimmReadTraceReg(i_nvdimm, CSAVE_TIMEOUT1);
+
+    // Error threshold, ready and command status registers
+    o_RegInfo.Error_Threshold_Status =
+        nvdimmReadTraceReg(i_nvdimm, ERROR_THRESHOLD_STATUS);
+    o_RegInfo.NVDimm_Ready =
+        nvdimmReadTraceReg(i_nvdimm, NVDIMM_READY);
+    o_RegInfo.NVDimm_CMD_Status0 =
+        nvdimmReadTraceReg(i_nvdimm, NVDIMM_CMD_STATUS0);
+
+    // Erase registers
+    o_RegInfo.Erase_Status =
+        nvdimmReadTraceReg(i_nvdimm, ERASE_STATUS);
+    o_RegInfo.Erase_Timeout0 =
+        nvdimmReadTraceReg(i_nvdimm, ERASE_TIMEOUT0);
+    o_RegInfo.Erase_Timeout1 =
+        nvdimmReadTraceReg(i_nvdimm, ERASE_TIMEOUT1);
+
+    // Abort command timeout and energy source policy status
+    o_RegInfo.Abort_CMD_Timeout =
+        nvdimmReadTraceReg(i_nvdimm, ABORT_CMD_TIMEOUT);
+    o_RegInfo.Set_ES_Policy_Status =
+        nvdimmReadTraceReg(i_nvdimm, SET_ES_POLICY_STATUS);
+
+    // Restore registers
+    o_RegInfo.Restore_Status =
+        nvdimmReadTraceReg(i_nvdimm, RESTORE_STATUS);
+    o_RegInfo.Restore_Fail_Info =
+        nvdimmReadTraceReg(i_nvdimm, RESTORE_FAIL_INFO);
+    o_RegInfo.Restore_Timeout0 =
+        nvdimmReadTraceReg(i_nvdimm, RESTORE_TIMEOUT0);
+    o_RegInfo.Restore_Timeout1 =
+        nvdimmReadTraceReg(i_nvdimm, RESTORE_TIMEOUT1);
+
+    // Arm registers
+    o_RegInfo.Arm_Status =
+        nvdimmReadTraceReg(i_nvdimm, ARM_STATUS);
+    o_RegInfo.Arm_Timeout0 =
+        nvdimmReadTraceReg(i_nvdimm, ARM_TIMEOUT0);
+    o_RegInfo.Arm_Timeout1 =
+        nvdimmReadTraceReg(i_nvdimm, ARM_TIMEOUT1);
+
+    // Event notification status
+    o_RegInfo.Set_Event_Notification_Status =
+        nvdimmReadTraceReg(i_nvdimm, SET_EVENT_NOTIFICATION_STATUS);
+}
+
+/**
+ * @brief Helper function for standard callout of an NVDIMM
+ *
+ * @param[in] i_nvdimm - nvdimm target
+ *
+ * @param[in] i_step - the nvdimm function calling the health check
+ *
+ * @param[out] o_err - error log handler to be modified
+ *
+ * @return bool - true to commit log and continue, false to return
+ *                the error log to caller and exit.
+ */
+bool nvdimmCalloutDimm(Target *i_nvdimm, uint8_t i_step, errlHndl_t& o_err)
+{
+    // Result handed back to the caller: stays true unless the step-specific
+    // check below finds the image / restore / arm state to be bad.
+    bool l_continue = true;
+    uint8_t l_data = 0x0;
+    errlHndl_t l_err = nullptr;
+
+    // Check which callout check is necessary
+    switch(i_step)
+    {
+        // Post save errors: callout severity depends on image validity
+        case HEALTH_SAVE:
+        {
+            // Check to see if the nvdimm image is still valid
+            l_err = nvdimmValidImage(i_nvdimm, l_continue);
+            if(l_err)
+            {
+                errlCommit( l_err, NVDIMM_COMP_ID );
+            }
+
+            // Check image validity and set dimm status accordingly
+            if(l_continue)
+            {
+                // Set ATTR_NV_STATUS_FLAG to restored as data may still persist
+                nvdimmSetStatusFlag(i_nvdimm, NSTD_VAL_ERROR);
+
+                // Callout dimm but do not deconfig or gard
+                o_err->addPartCallout( i_nvdimm,
+                                       HWAS::NV_CONTROLLER_PART_TYPE,
+                                       HWAS::SRCI_PRIORITY_LOW);
+            }
+            else
+            {
+                // Callout, deconfig and gard the dimm
+                o_err->addPartCallout( i_nvdimm,
+                                       HWAS::NV_CONTROLLER_PART_TYPE,
+                                       HWAS::SRCI_PRIORITY_LOW,
+                                       HWAS::DECONFIG,
+                                       HWAS::GARD_Fatal);
+            }
+
+            break;
+        }
+
+        // Post restore errors: callout severity depends on restore status
+        case HEALTH_RESTORE:
+        {
+            // Check restore status
+            l_err = nvdimmReadReg(i_nvdimm, RESTORE_STATUS, l_data);
+            if (l_err)
+            {
+                errlCommit( l_err, NVDIMM_COMP_ID );
+            }
+            else if ((l_data & RSTR_SUCCESS) != RSTR_SUCCESS)
+            {
+                l_continue = false;
+            }
+
+            // Check restore status and set dimm status accordingly
+            if(l_continue)
+            {
+                // Set ATTR_NV_STATUS_FLAG to restored as data may still persist
+                nvdimmSetStatusFlag(i_nvdimm, NSTD_VAL_ERROR);
+
+                // Callout dimm but do not deconfig or gard
+                o_err->addPartCallout( i_nvdimm,
+                                       HWAS::NV_CONTROLLER_PART_TYPE,
+                                       HWAS::SRCI_PRIORITY_LOW);
+            }
+            else
+            {
+                // Callout, deconfig and gard the dimm
+                o_err->addPartCallout( i_nvdimm,
+                                       HWAS::NV_CONTROLLER_PART_TYPE,
+                                       HWAS::SRCI_PRIORITY_LOW,
+                                       HWAS::DECONFIG,
+                                       HWAS::GARD_Fatal);
+            }
+
+            break;
+        }
+
+        // Pre ARM errors need check for arm success
+        case HEALTH_PRE_ARM:
+        {
+            // Check arm status
+            l_err = nvdimmReadReg(i_nvdimm, ARM_STATUS, l_data);
+            if (l_err)
+            {
+                // Same policy as the restore case: a failed status read is
+                // logged but does not by itself flip the result.
+                errlCommit( l_err, NVDIMM_COMP_ID );
+            }
+            else if (((l_data & ARM_SUCCESS) != ARM_SUCCESS) ||
+                     ((l_data & RESET_N_ARMED) != RESET_N_ARMED))
+            {
+                // The dimm did not arm. This used to assign 'true', which
+                // left the flag unchanged and made the armed branch below
+                // unreachable.
+                l_continue = false;
+            }
+
+            // Check arm status and set dimm status accordingly
+            if(l_continue)
+            {
+                // Set ATTR_NV_STATUS_FLAG to restored as data may still persist
+                nvdimmSetStatusFlag(i_nvdimm, NSTD_VAL_ERROR);
+
+                // Callout dimm but do not deconfig or gard
+                o_err->addPartCallout( i_nvdimm,
+                                       HWAS::NV_CONTROLLER_PART_TYPE,
+                                       HWAS::SRCI_PRIORITY_LOW);
+            }
+            else
+            {
+                // Set ATTR_NV_STATUS_FLAG to dimm disarmed
+                l_err = notifyNvdimmProtectionChange(i_nvdimm, NVDIMM_DISARMED);
+                if (l_err)
+                {
+                    errlCommit( l_err, NVDIMM_COMP_ID );
+                }
+
+                // Callout and gard the dimm, without deconfiguring it
+                // NOTE(review): the BPM/Cable variant deconfigs in this
+                // situation - confirm NO_DECONFIG is intended here.
+                o_err->addPartCallout( i_nvdimm,
+                                       HWAS::NV_CONTROLLER_PART_TYPE,
+                                       HWAS::SRCI_PRIORITY_LOW,
+                                       HWAS::NO_DECONFIG,
+                                       HWAS::GARD_Fatal);
+            }
+
+            break;
+        }
+
+        // Post ARM errors always continue with a low priority callout
+        case HEALTH_POST_ARM:
+        {
+            // Callout dimm but do not deconfig or gard
+            o_err->addPartCallout( i_nvdimm,
+                                   HWAS::NV_CONTROLLER_PART_TYPE,
+                                   HWAS::SRCI_PRIORITY_LOW);
+
+            // Set ATTR_NV_STATUS_FLAG to restored as data may persist despite errors
+            nvdimmSetStatusFlag(i_nvdimm, NSTD_VAL_ERROR);
+
+            break;
+        }
+
+        // Any other step: no callout is added, caller continues
+        default:
+        {
+            break;
+        }
+
+    }
+
+    return l_continue;
+}
+
+/**
+ * @brief Helper function for BPM/Cable high, NVDIMM low callout
+ *
+ * @param[in] i_nvdimm - nvdimm target
+ *
+ * @param[in] i_step - the nvdimm function calling the health check
+ *
+ * @param[out] o_err - error log handler to be modified
+ *
+ * @return bool - true to commit log and continue, false to return
+ *                the error log to caller and exit.
+ */
+bool nvdimmBPMCableCallout(Target *i_nvdimm, uint8_t i_step, errlHndl_t& o_err)
+{
+    // Result handed back to the caller: stays true unless the step-specific
+    // check below finds the image / restore / arm state to be bad.
+    bool l_continue = true;
+    uint8_t l_data = 0x0;
+    errlHndl_t l_err = nullptr;
+
+    // Check which callout check is necessary
+    switch(i_step)
+    {
+        // Post save errors: dimm callout severity depends on image validity
+        case HEALTH_SAVE:
+        {
+            // Check to see if the nvdimm image is still valid
+            l_err = nvdimmValidImage(i_nvdimm, l_continue);
+            if(l_err)
+            {
+                errlCommit( l_err, NVDIMM_COMP_ID );
+            }
+
+            // Callout BPM and Cable but cannot deconfig or gard
+            o_err->addPartCallout( i_nvdimm,
+                                   HWAS::BPM_PART_TYPE,
+                                   HWAS::SRCI_PRIORITY_HIGH);
+            o_err->addPartCallout( i_nvdimm,
+                                   HWAS::BPM_CABLE_PART_TYPE,
+                                   HWAS::SRCI_PRIORITY_HIGH);
+
+            // Check image validity and set dimm status accordingly
+            if(l_continue)
+            {
+                // Set ATTR_NV_STATUS_FLAG to restored as data may still persist
+                nvdimmSetStatusFlag(i_nvdimm, NSTD_VAL_ERROR);
+
+                // Callout dimm but do not deconfig or gard
+                o_err->addPartCallout( i_nvdimm,
+                                       HWAS::NV_CONTROLLER_PART_TYPE,
+                                       HWAS::SRCI_PRIORITY_LOW);
+            }
+            else
+            {
+                // Callout dimm, deconfig and gard
+                o_err->addPartCallout( i_nvdimm,
+                                       HWAS::NV_CONTROLLER_PART_TYPE,
+                                       HWAS::SRCI_PRIORITY_LOW,
+                                       HWAS::DECONFIG,
+                                       HWAS::GARD_Fatal);
+            }
+
+            break;
+        }
+
+        // Post restore errors: dimm callout severity depends on restore status
+        case HEALTH_RESTORE:
+        {
+            // Check restore status
+            l_err = nvdimmReadReg(i_nvdimm, RESTORE_STATUS, l_data);
+            if (l_err)
+            {
+                errlCommit( l_err, NVDIMM_COMP_ID );
+            }
+            else if ((l_data & RSTR_SUCCESS) != RSTR_SUCCESS)
+            {
+                l_continue = false;
+            }
+
+            // Callout BPM and Cable but cannot deconfig or gard
+            o_err->addPartCallout( i_nvdimm,
+                                   HWAS::BPM_PART_TYPE,
+                                   HWAS::SRCI_PRIORITY_HIGH);
+            o_err->addPartCallout( i_nvdimm,
+                                   HWAS::BPM_CABLE_PART_TYPE,
+                                   HWAS::SRCI_PRIORITY_HIGH);
+
+            // Check restore status and set dimm status accordingly
+            if(l_continue)
+            {
+                // Set ATTR_NV_STATUS_FLAG to restored as data may still persist
+                nvdimmSetStatusFlag(i_nvdimm, NSTD_VAL_ERROR);
+
+                // Callout dimm but do not deconfig or gard
+                o_err->addPartCallout( i_nvdimm,
+                                       HWAS::NV_CONTROLLER_PART_TYPE,
+                                       HWAS::SRCI_PRIORITY_LOW);
+            }
+            else
+            {
+                // Callout dimm, deconfig and gard
+                o_err->addPartCallout( i_nvdimm,
+                                       HWAS::NV_CONTROLLER_PART_TYPE,
+                                       HWAS::SRCI_PRIORITY_LOW,
+                                       HWAS::DECONFIG,
+                                       HWAS::GARD_Fatal);
+            }
+
+            break;
+        }
+
+        // Pre ARM errors need check for arm success
+        case HEALTH_PRE_ARM:
+        {
+            // Check arm status
+            l_err = nvdimmReadReg(i_nvdimm, ARM_STATUS, l_data);
+            if (l_err)
+            {
+                // Same policy as the restore case: a failed status read is
+                // logged but does not by itself flip the result.
+                errlCommit( l_err, NVDIMM_COMP_ID );
+            }
+            else if (((l_data & ARM_SUCCESS) != ARM_SUCCESS) ||
+                     ((l_data & RESET_N_ARMED) != RESET_N_ARMED))
+            {
+                // The dimm did not arm. This used to assign 'true', which
+                // left the flag unchanged and made the armed branch below
+                // unreachable.
+                l_continue = false;
+            }
+
+            // Callout BPM and Cable but cannot deconfig or gard
+            o_err->addPartCallout( i_nvdimm,
+                                   HWAS::BPM_PART_TYPE,
+                                   HWAS::SRCI_PRIORITY_HIGH);
+            o_err->addPartCallout( i_nvdimm,
+                                   HWAS::BPM_CABLE_PART_TYPE,
+                                   HWAS::SRCI_PRIORITY_HIGH);
+
+            // Check arm status and set dimm status accordingly
+            if(l_continue)
+            {
+                // Set ATTR_NV_STATUS_FLAG to restored as data may still persist
+                nvdimmSetStatusFlag(i_nvdimm, NSTD_VAL_ERROR);
+
+                // Callout dimm but do not deconfig or gard
+                o_err->addPartCallout( i_nvdimm,
+                                       HWAS::NV_CONTROLLER_PART_TYPE,
+                                       HWAS::SRCI_PRIORITY_LOW);
+            }
+            else
+            {
+                // Set ATTR_NV_STATUS_FLAG to dimm disarmed
+                l_err = notifyNvdimmProtectionChange(i_nvdimm, NVDIMM_DISARMED);
+                if (l_err)
+                {
+                    errlCommit( l_err, NVDIMM_COMP_ID );
+                }
+
+                // Callout dimm, deconfig and gard
+                o_err->addPartCallout( i_nvdimm,
+                                       HWAS::NV_CONTROLLER_PART_TYPE,
+                                       HWAS::SRCI_PRIORITY_LOW,
+                                       HWAS::DECONFIG,
+                                       HWAS::GARD_Fatal);
+            }
+
+            break;
+        }
+
+        // Post ARM errors always continue with callouts
+        case HEALTH_POST_ARM:
+        {
+            // Callout BPM and Cable on high, dimm on low; no deconfig or gard
+            o_err->addPartCallout( i_nvdimm,
+                                   HWAS::BPM_PART_TYPE,
+                                   HWAS::SRCI_PRIORITY_HIGH);
+            o_err->addPartCallout( i_nvdimm,
+                                   HWAS::BPM_CABLE_PART_TYPE,
+                                   HWAS::SRCI_PRIORITY_HIGH);
+            o_err->addPartCallout( i_nvdimm,
+                                   HWAS::NV_CONTROLLER_PART_TYPE,
+                                   HWAS::SRCI_PRIORITY_LOW);
+
+            // Set ATTR_NV_STATUS_FLAG to restored as data may still persist
+            nvdimmSetStatusFlag(i_nvdimm, NSTD_VAL_ERROR);
+
+            break;
+        }
+
+        // Any other step: no callout is added, caller continues
+        default:
+        {
+            break;
+        }
+
+    }
+
+    return l_continue;
+}
+
+/**
+ * @brief Helper function for BPM high, NVDIMM low callout
+ *
+ * For the save, restore and post-arm steps the BPM is called out on high
+ * and the dimm on low priority, and the result is always true. For the
+ * pre-arm step the same callouts are added and the arm status decides
+ * between marking the dimm status flag and reporting the dimm as disarmed.
+ *
+ * @param[in] i_nvdimm - nvdimm target
+ *
+ * @param[in] i_step - the nvdimm function calling the health check
+ *
+ * @param[out] o_err - error log handler to be modified
+ *
+ * @return bool - true to commit log and continue, false to return
+ *                the error log to caller and exit.
+ */
+bool nvdimmBPMCallout(Target *i_nvdimm, uint8_t i_step, errlHndl_t& o_err)
+{
+    bool l_continue = true;
+    uint8_t l_data = 0x0;
+    errlHndl_t l_err = nullptr;
+
+    // Check which callout check is necessary
+    switch(i_step)
+    {
+        // Post save, post restore and post ARM errors are handled the same
+        // way and always continue with callouts
+        case HEALTH_SAVE:
+        case HEALTH_RESTORE:
+        case HEALTH_POST_ARM:
+        {
+            // Callout BPM on high
+            o_err->addPartCallout( i_nvdimm,
+                                   HWAS::BPM_PART_TYPE,
+                                   HWAS::SRCI_PRIORITY_HIGH);
+
+            // Callout dimm but do not deconfig or gard
+            o_err->addPartCallout( i_nvdimm,
+                                   HWAS::NV_CONTROLLER_PART_TYPE,
+                                   HWAS::SRCI_PRIORITY_LOW);
+
+            // Set ATTR_NV_STATUS_FLAG to restored as data may still persist
+            nvdimmSetStatusFlag(i_nvdimm, NSTD_VAL_ERROR);
+
+            break;
+        }
+
+        // Pre ARM errors need check for arm success
+        case HEALTH_PRE_ARM:
+        {
+            // Check arm status
+            l_err = nvdimmReadReg(i_nvdimm, ARM_STATUS, l_data);
+            if (l_err)
+            {
+                // A failed status read is logged but does not by itself
+                // flip the result.
+                errlCommit( l_err, NVDIMM_COMP_ID );
+            }
+            else if (((l_data & ARM_SUCCESS) != ARM_SUCCESS) ||
+                     ((l_data & RESET_N_ARMED) != RESET_N_ARMED))
+            {
+                // The dimm did not arm. This used to assign 'true', which
+                // left the flag unchanged and made the armed branch below
+                // unreachable.
+                l_continue = false;
+            }
+
+            // Callout BPM on high
+            o_err->addPartCallout( i_nvdimm,
+                                   HWAS::BPM_PART_TYPE,
+                                   HWAS::SRCI_PRIORITY_HIGH);
+
+            // Callout dimm but do not deconfig or gard
+            o_err->addPartCallout( i_nvdimm,
+                                   HWAS::NV_CONTROLLER_PART_TYPE,
+                                   HWAS::SRCI_PRIORITY_LOW);
+
+            // Check arm status and set dimm status accordingly
+            if(l_continue)
+            {
+                // Set ATTR_NV_STATUS_FLAG to restored as data may still persist
+                nvdimmSetStatusFlag(i_nvdimm, NSTD_VAL_ERROR);
+            }
+            else
+            {
+                // Set ATTR_NV_STATUS_FLAG to dimm disarmed
+                l_err = notifyNvdimmProtectionChange(i_nvdimm, NVDIMM_DISARMED);
+                if (l_err)
+                {
+                    errlCommit( l_err, NVDIMM_COMP_ID );
+                }
+            }
+
+            break;
+        }
+
+        // Any other step: no callout is added, caller continues
+        default:
+        {
+            break;
+        }
+
+    }
+
+    return l_continue;
+}
+
+/**
+ * @brief Function checking the Health Status Registers for an nvdimm
+ *
+ * @param[in] i_nvdimm - nvdimm target
+ *
+ * @param[in] i_step - the nvdimm step calling the check
+ *
+ * @param[out] o_continue - bool to signal a return to caller fail
+ *
+ * @return errlHndl_t - Null if successful, otherwise a pointer to
+ *                      the error log.
+ */ +errlHndl_t nvdimmHealthStatusCheck(Target *i_nvdimm, uint8_t i_step, bool& o_continue) +{ + uint8_t l_data = 0x0; + errlHndl_t l_err = nullptr; + errlHndl_t l_err_t = nullptr; + nvdimm_reg_t l_RegInfo; + bool l_arm_timeout = false; + + if (i_step == HEALTH_PRE_ARM) + { + l_arm_timeout = o_continue; + } + + //Collect Register data for parsing and traces + nvdimmTraceRegs(i_nvdimm, l_RegInfo); + + // Read SET_EVENT_NOTIFICATION_STATUS register + l_err = nvdimmReadReg(i_nvdimm, SET_EVENT_NOTIFICATION_STATUS, l_data); + if(l_err) + { + errlCommit( l_err, NVDIMM_COMP_ID ); + } + l_RegInfo.Set_Event_Notification_Status = l_data; + + // Read RESTORE STATUS register + l_err = nvdimmReadReg(i_nvdimm, RESTORE_STATUS , l_data); + if(l_err) + { + errlCommit( l_err, NVDIMM_COMP_ID ); + } + l_RegInfo.Restore_Status = l_data; + + // Read RESTORE_FAIL_INFO register + l_err = nvdimmReadReg(i_nvdimm, RESTORE_FAIL_INFO , l_data); + if(l_err) + { + errlCommit( l_err, NVDIMM_COMP_ID ); + } + l_RegInfo.Restore_Fail_Info = l_data; + + // Read NVDIMM_CMD_STATUS0 register + l_err = nvdimmReadReg(i_nvdimm, NVDIMM_CMD_STATUS0 , l_data); + if(l_err) + { + errlCommit( l_err, NVDIMM_COMP_ID ); + } + l_RegInfo.NVDimm_CMD_Status0 = l_data; + + // Read ARM_STATUS register + l_err = nvdimmReadReg(i_nvdimm, ARM_STATUS , l_data); + if(l_err) + { + errlCommit( l_err, NVDIMM_COMP_ID ); + } + l_RegInfo.Arm_Status = l_data; + + // Read SET_ES_POLICY_STATUS register + l_err = nvdimmReadReg(i_nvdimm, SET_ES_POLICY_STATUS , l_data); + if(l_err) + { + errlCommit( l_err, NVDIMM_COMP_ID ); + } + l_RegInfo.Set_ES_Policy_Status = l_data; + + // Check all nvdimm deconfig cases + do + { + // Check MODULE_HEALTH_STATUS0[0] + if ((l_RegInfo.Module_Health_Status0 & VOLTAGE_REGULATOR_FAILED) == VOLTAGE_REGULATOR_FAILED) + { + /*@ + *@errortype + *@reasoncode NVDIMM_VOLTAGE_REGULATOR_FAILED + *@severity ERRORLOG_SEV_PREDICTIVE + *@moduleid NVDIMM_MODULE_HEALTH_STATUS_CHECK + *@userdata1[0:31] Target Huid + 
*@userdata2 + *@devdesc NVDIMM failed module health status check due to + * voltage regulator failure + *@custdesc NVDIMM failed module health status check + */ + l_err = new ERRORLOG::ErrlEntry( ERRORLOG::ERRL_SEV_PREDICTIVE, + NVDIMM_MODULE_HEALTH_STATUS_CHECK, + NVDIMM_VOLTAGE_REGULATOR_FAILED, + TARGETING::get_huid(i_nvdimm), + 0x0, + ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); + break; + } + + // Check MODULE_HEALTH_STATUS0[1] + if ((l_RegInfo.Module_Health_Status0 & VDD_LOST) == VDD_LOST) + { + /*@ + *@errortype + *@reasoncode NVDIMM_VDD_LOST + *@severity ERRORLOG_SEV_PREDICTIVE + *@moduleid NVDIMM_MODULE_HEALTH_STATUS_CHECK + *@userdata1[0:31] Target Huid + *@userdata2 + *@devdesc NVDIMM failed module health status check due to + * vdd loss + *@custdesc NVDIMM failed module health status check + */ + l_err = new ERRORLOG::ErrlEntry( ERRORLOG::ERRL_SEV_PREDICTIVE, + NVDIMM_MODULE_HEALTH_STATUS_CHECK, + NVDIMM_VDD_LOST, + TARGETING::get_huid(i_nvdimm), + 0x0, + ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); + break; + } + + // Check MODULE_HEALTH_STATUS0[2] + if ((l_RegInfo.Module_Health_Status0 & VPP_LOST) == VPP_LOST) + { + /*@ + *@errortype + *@reasoncode NVDIMM_VPP_LOST + *@severity ERRORLOG_SEV_PREDICTIVE + *@moduleid NVDIMM_MODULE_HEALTH_STATUS_CHECK + *@userdata1[0:31] Target Huid + *@userdata2 + *@devdesc NVDIMM failed module health status check due to + * vpp loss + *@custdesc NVDIMM failed module health status check + */ + l_err = new ERRORLOG::ErrlEntry( ERRORLOG::ERRL_SEV_PREDICTIVE, + NVDIMM_MODULE_HEALTH_STATUS_CHECK, + NVDIMM_VPP_LOST, + TARGETING::get_huid(i_nvdimm), + 0x0, + ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); + break; + } + + // Check MODULE_HEALTH_STATUS0[3] + if ((l_RegInfo.Module_Health_Status0 & VTT_LOST) == VTT_LOST) + { + /*@ + *@errortype + *@reasoncode NVDIMM_VTT_LOST + *@severity ERRORLOG_SEV_PREDICTIVE + *@moduleid NVDIMM_MODULE_HEALTH_STATUS_CHECK + *@userdata1[0:31] Target Huid + *@userdata2 + *@devdesc NVDIMM failed module health status check 
due to + * vtt loss + *@custdesc NVDIMM failed module health status check + */ + l_err = new ERRORLOG::ErrlEntry( ERRORLOG::ERRL_SEV_PREDICTIVE, + NVDIMM_MODULE_HEALTH_STATUS_CHECK, + NVDIMM_VTT_LOST, + TARGETING::get_huid(i_nvdimm), + 0x0, + ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); + break; + } + + // Check MODULE_HEALTH_STATUS0[4] + if ((l_RegInfo.Module_Health_Status0 & DRAM_NOT_SELF_REFRESH) == DRAM_NOT_SELF_REFRESH) + { + /*@ + *@errortype + *@reasoncode NVDIMM_DRAM_NOT_SELF_REFRESH + *@severity ERRORLOG_SEV_PREDICTIVE + *@moduleid NVDIMM_MODULE_HEALTH_STATUS_CHECK + *@userdata1[0:31] Target Huid + *@userdata2 + *@devdesc NVDIMM failed module health status check due to + * no self refresh on the nvdimm + *@custdesc NVDIMM failed module health status check + */ + l_err = new ERRORLOG::ErrlEntry( ERRORLOG::ERRL_SEV_PREDICTIVE, + NVDIMM_MODULE_HEALTH_STATUS_CHECK, + NVDIMM_DRAM_NOT_SELF_REFRESH, + TARGETING::get_huid(i_nvdimm), + 0x0, + ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); + break; + } + + // Check MODULE_HEALTH_STATUS0[5] + if ((l_RegInfo.Module_Health_Status0 & CONTROLLER_HARDWARE_ERROR) == CONTROLLER_HARDWARE_ERROR) + { + /*@ + *@errortype + *@reasoncode NVDIMM_CONTROLLER_HARDWARE_ERROR + *@severity ERRORLOG_SEV_PREDICTIVE + *@moduleid NVDIMM_MODULE_HEALTH_STATUS_CHECK + *@userdata1[0:31] Target Huid + *@userdata2 + *@devdesc NVDIMM failed module health status check due to + * error with the hardware controller + *@custdesc NVDIMM failed module health status check + */ + l_err = new ERRORLOG::ErrlEntry( ERRORLOG::ERRL_SEV_PREDICTIVE, + NVDIMM_MODULE_HEALTH_STATUS_CHECK, + NVDIMM_CONTROLLER_HARDWARE_ERROR, + TARGETING::get_huid(i_nvdimm), + 0x0, + ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); + break; + } + + // Check MODULE_HEALTH_STATUS0[6] + if ((l_RegInfo.Module_Health_Status0 & NVM_CONTROLLER_ERROR) == NVM_CONTROLLER_ERROR) + { + /*@ + *@errortype + *@reasoncode NVDIMM_NVM_CONTROLLER_ERROR + *@severity ERRORLOG_SEV_PREDICTIVE + *@moduleid 
NVDIMM_MODULE_HEALTH_STATUS_CHECK + *@userdata1[0:31] Target Huid + *@userdata2 + *@devdesc NVDIMM failed module health status check due to + * error with the nvdimm controller + *@custdesc NVDIMM failed module health status check + */ + l_err = new ERRORLOG::ErrlEntry( ERRORLOG::ERRL_SEV_PREDICTIVE, + NVDIMM_MODULE_HEALTH_STATUS_CHECK, + NVDIMM_NVM_CONTROLLER_ERROR, + TARGETING::get_huid(i_nvdimm), + 0x0, + ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); + break; + } + + + // Check MODULE_HEALTH_STATUS0[7] + if ((l_RegInfo.Module_Health_Status0 & NVM_LIFETIME_ERROR) == NVM_LIFETIME_ERROR) + { + /*@ + *@errortype + *@reasoncode NVDIMM_NVM_LIFETIME_ERROR + *@severity ERRORLOG_SEV_PREDICTIVE + *@moduleid NVDIMM_MODULE_HEALTH_STATUS_CHECK + *@userdata1[0:31] Target Huid + *@userdata2 + *@devdesc NVDIMM failed module health status check due to + * an nvdimm lifetime error + *@custdesc NVDIMM failed module health status check + */ + l_err = new ERRORLOG::ErrlEntry( ERRORLOG::ERRL_SEV_PREDICTIVE, + NVDIMM_MODULE_HEALTH_STATUS_CHECK, + NVDIMM_NVM_LIFETIME_ERROR, + TARGETING::get_huid(i_nvdimm), + 0x0, + ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); + break; + } + + // Check MODULE_HEALTH_STATUS1[1] + if ((l_RegInfo.Module_Health_Status1 & INVALID_FIRMWARE_ERROR) == INVALID_FIRMWARE_ERROR) + { + /*@ + *@errortype + *@reasoncode NVDIMM_INVALID_FIRMWARE_ERROR + *@severity ERRORLOG_SEV_PREDICTIVE + *@moduleid NVDIMM_MODULE_HEALTH_STATUS_CHECK + *@userdata1[0:31] Target Huid + *@userdata2 + *@devdesc NVDIMM failed module health status check due to + * an invalid firmware image + *@custdesc NVDIMM failed module health status check + */ + l_err = new ERRORLOG::ErrlEntry( ERRORLOG::ERRL_SEV_PREDICTIVE, + NVDIMM_MODULE_HEALTH_STATUS_CHECK, + NVDIMM_INVALID_FIRMWARE_ERROR, + TARGETING::get_huid(i_nvdimm), + 0x0, + ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); + break; + } + + // Check MODULE_HEALTH_STATUS1[2] + if ((l_RegInfo.Module_Health_Status1 & CONFIG_DATA_ERROR) == CONFIG_DATA_ERROR) + { + /*@ + 
*@errortype + *@reasoncode NVDIMM_CONFIG_DATA_ERROR + *@severity ERRORLOG_SEV_PREDICTIVE + *@moduleid NVDIMM_MODULE_HEALTH_STATUS_CHECK + *@userdata1[0:31] Target Huid + *@userdata2 + *@devdesc NVDIMM failed module health status check due to + * invalid configuration data + *@custdesc NVDIMM failed module health status check + */ + l_err = new ERRORLOG::ErrlEntry( ERRORLOG::ERRL_SEV_PREDICTIVE, + NVDIMM_MODULE_HEALTH_STATUS_CHECK, + NVDIMM_CONFIG_DATA_ERROR, + TARGETING::get_huid(i_nvdimm), + 0x0, + ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); + break; + } + + }while(0); + + if (l_err) + { + // Setup Trace + l_err->collectTrace( NVDIMM_COMP_NAME ); + + // Add reg traces to the error log + NVDIMM::UdNvdimmOPParms( l_RegInfo ).addToLog(l_err); + + // Callout nvdimm depending on istep call + o_continue &= nvdimmCalloutDimm(i_nvdimm, i_step, l_err); + + if(l_arm_timeout) + { + // Callout, deconfig and gard the dimm + l_err->addPartCallout( i_nvdimm, + HWAS::NV_CONTROLLER_PART_TYPE, + HWAS::SRCI_PRIORITY_LOW, + HWAS::NO_DECONFIG, + HWAS::GARD_Fatal); + } + } + + // Check all BPM and Cable high, nvdimm low cases + do + { + // If function calling is SAVE, ignore NOT_ENOUGH_ENERGY_FOR_CSAVE + if (i_step == HEALTH_SAVE) + { + // Check MODULE_HEALTH_STATUS1[0] + if ((l_RegInfo.Module_Health_Status1 & NOT_ENOUGH_ENERGY_FOR_CSAVE) == NOT_ENOUGH_ENERGY_FOR_CSAVE) + { + /*@ + *@errortype + *@reasoncode NVDIMM_NOT_ENOUGH_ENERGY_FOR_CSAVE + *@severity ERRORLOG_SEV_PREDICTIVE + *@moduleid NVDIMM_MODULE_HEALTH_STATUS_CHECK + *@userdata1[0:31] Target Huid + *@userdata2 + *@devdesc NVDIMM failed module health status check due to + * insufficient energy for csave + *@custdesc NVDIMM failed module health status check + */ + l_err_t = new ERRORLOG::ErrlEntry( ERRORLOG::ERRL_SEV_PREDICTIVE, + NVDIMM_MODULE_HEALTH_STATUS_CHECK, + NVDIMM_NOT_ENOUGH_ENERGY_FOR_CSAVE, + TARGETING::get_huid(i_nvdimm), + 0x0, + ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); + break; + } + } + + // Check 
MODULE_HEALTH_STATUS1[3] + if ((l_RegInfo.Module_Health_Status1 & NO_ES_PRESENT) == NO_ES_PRESENT) + { + /*@ + *@errortype + *@reasoncode NVDIMM_NO_ES_PRESENT + *@severity ERRORLOG_SEV_PREDICTIVE + *@moduleid NVDIMM_MODULE_HEALTH_STATUS_CHECK + *@userdata1[0:31] Target Huid + *@userdata2 + *@devdesc NVDIMM failed module health status check due to + * no ES active + *@custdesc NVDIMM failed module health status check + */ + l_err_t = new ERRORLOG::ErrlEntry( ERRORLOG::ERRL_SEV_PREDICTIVE, + NVDIMM_MODULE_HEALTH_STATUS_CHECK, + NVDIMM_NO_ES_PRESENT, + TARGETING::get_huid(i_nvdimm), + 0x0, + ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); + break; + } + + // Check MODULE_HEALTH_STATUS1[5] + if ((l_RegInfo.Module_Health_Status1 & ES_HARDWARE_FAILURE) == ES_HARDWARE_FAILURE) + { + /*@ + *@errortype + *@reasoncode NVDIMM_ES_HARDWARE_FAILURE + *@severity ERRORLOG_SEV_PREDICTIVE + *@moduleid NVDIMM_MODULE_HEALTH_STATUS_CHECK + *@userdata1[0:31] Target Huid + *@userdata2 + *@devdesc NVDIMM failed module health status check due to + * ES hardware failure + *@custdesc NVDIMM failed module health status check + */ + l_err_t = new ERRORLOG::ErrlEntry( ERRORLOG::ERRL_SEV_PREDICTIVE, + NVDIMM_MODULE_HEALTH_STATUS_CHECK, + NVDIMM_ES_HARDWARE_FAILURE, + TARGETING::get_huid(i_nvdimm), + 0x0, + ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); + break; + } + + // Check MODULE_HEALTH_STATUS1[6] + if ((l_RegInfo.Module_Health_Status1 & ES_HEALTH_ASSESSMENT_ERROR) == ES_HEALTH_ASSESSMENT_ERROR) + { + /*@ + *@errortype + *@reasoncode NVDIMM_ES_HEALTH_ASSESSMENT_ERROR + *@severity ERRORLOG_SEV_PREDICTIVE + *@moduleid NVDIMM_MODULE_HEALTH_STATUS_CHECK + *@userdata1[0:31] Target Huid + *@userdata2 + *@devdesc NVDIMM failed module health status check due to + * ES error during health assessment + *@custdesc NVDIMM failed module health status check + */ + l_err_t = new ERRORLOG::ErrlEntry( ERRORLOG::ERRL_SEV_PREDICTIVE, + NVDIMM_MODULE_HEALTH_STATUS_CHECK, + NVDIMM_ES_HEALTH_ASSESSMENT_ERROR, + 
TARGETING::get_huid(i_nvdimm), + 0x0, + ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); + break; + } + + }while(0); + + if (l_err_t) + { + // Setup Trace + l_err_t->collectTrace( NVDIMM_COMP_NAME ); + + // Add reg traces to the error log + NVDIMM::UdNvdimmOPParms( l_RegInfo ).addToLog(l_err_t); + + // Callout BPM, Cable, and nvdimm + o_continue &= nvdimmBPMCableCallout(i_nvdimm, i_step, l_err_t); + } + + // Check for multiple errors and commit old error + if ((l_err) && (l_err_t)) + { + errlCommit( l_err, NVDIMM_COMP_ID ); + } + + // If there was a new error, save off to l_err + if (l_err_t) + { + l_err = l_err_t; + l_err_t = nullptr; + } + + // Check all BPM high, nvdimm low cases + do + { + // Check ERROR_THRESHOLD_STATUS[1] + if ((l_RegInfo.Error_Threshold_Status & ES_LIFETIME_ERROR) == ES_LIFETIME_ERROR) + { + /*@ + *@errortype + *@reasoncode NVDIMM_ES_LIFETIME_ERROR + *@severity ERRORLOG_SEV_PREDICTIVE + *@moduleid NVDIMM_MODULE_HEALTH_STATUS_CHECK + *@userdata1[0:31] Target Huid + *@userdata2 + *@devdesc NVDIMM failed module health status check due to + * ES lifetime error + *@custdesc NVDIMM failed module health status check + */ + l_err_t = new ERRORLOG::ErrlEntry( ERRORLOG::ERRL_SEV_PREDICTIVE, + NVDIMM_MODULE_HEALTH_STATUS_CHECK, + NVDIMM_ES_LIFETIME_ERROR, + TARGETING::get_huid(i_nvdimm), + 0x0, + ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); + break; + } + + // Check ERROR_THRESHOLD_STATUS[2] + if ((l_RegInfo.Error_Threshold_Status & ES_TEMP_ERROR) == ES_TEMP_ERROR) + { + /*@ + *@errortype + *@reasoncode NVDIMM_ES_TEMP_ERROR + *@severity ERRORLOG_SEV_PREDICTIVE + *@moduleid NVDIMM_MODULE_HEALTH_STATUS_CHECK + *@userdata1[0:31] Target Huid + *@userdata2 + *@devdesc NVDIMM failed module health status check due to + * ES temporary error + *@custdesc NVDIMM failed module health status check + */ + l_err_t = new ERRORLOG::ErrlEntry( ERRORLOG::ERRL_SEV_PREDICTIVE, + NVDIMM_MODULE_HEALTH_STATUS_CHECK, + NVDIMM_ES_TEMP_ERROR, + TARGETING::get_huid(i_nvdimm), + 0x0, + 
ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); + break; + } + + }while(0); + + if (l_err_t) + { + // Setup Trace + l_err_t->collectTrace( NVDIMM_COMP_NAME ); + + // Add reg traces to the error log + NVDIMM::UdNvdimmOPParms( l_RegInfo ).addToLog(l_err_t); + + // Callout nvdimm + o_continue &= nvdimmBPMCallout(i_nvdimm, i_step, l_err_t); + } + + // Check for multiple errors and commit old error + if ((l_err) && (l_err_t)) + { + errlCommit( l_err, NVDIMM_COMP_ID ); + } + + // If there was a new error, save off to l_err + if (l_err_t) + { + l_err = l_err_t; + l_err_t = nullptr; + } + + // Check special pre arm case + if (i_step == HEALTH_PRE_ARM) + { + // Check ES_POLICY_NOT_SET[4] + if ((l_RegInfo.Set_ES_Policy_Status & ES_POLICY_NOT_SET) == ES_POLICY_NOT_SET) + { + /*@ + *@errortype + *@reasoncode NVDIMM_ES_POLICY_NOT_SET + *@severity ERRORLOG_SEV_PREDICTIVE + *@moduleid NVDIMM_MODULE_HEALTH_STATUS_CHECK + *@userdata1[0:31] Target Huid + *@userdata2 + *@devdesc NVDIMM failed module health status check due to + * ES policy not being set during an arm + *@custdesc NVDIMM failed module health status check + */ + l_err_t = new ERRORLOG::ErrlEntry( ERRORLOG::ERRL_SEV_PREDICTIVE, + NVDIMM_MODULE_HEALTH_STATUS_CHECK, + NVDIMM_ES_POLICY_NOT_SET, + TARGETING::get_huid(i_nvdimm), + 0x0, + ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); + o_continue = true; + // Callout dimm but no deconfig and gard + l_err_t->addPartCallout( i_nvdimm, + HWAS::NV_CONTROLLER_PART_TYPE, + HWAS::SRCI_PRIORITY_LOW); + } + } + + // Check for multiple errors and commit old error + if ((l_err) && (l_err_t)) + { + errlCommit( l_err, NVDIMM_COMP_ID ); + } + + // If there was a new error, save off to l_err + if (l_err_t) + { + l_err = l_err_t; + l_err_t = nullptr; + } + + return l_err; +} + +} // end NVDIMM namespace diff --git a/src/usr/isteps/nvdimm/nvdimmErrorLog.H b/src/usr/isteps/nvdimm/nvdimmErrorLog.H new file mode 100644 index 00000000000..dae8e2f2f5e --- /dev/null +++ b/src/usr/isteps/nvdimm/nvdimmErrorLog.H @@ 
-0,0 +1,108 @@ +/* IBM_PROLOG_BEGIN_TAG */ +/* This is an automatically generated prolog. */ +/* */ +/* $Source: src/usr/isteps/nvdimm/nvdimmErrorLog.H $ */ +/* */ +/* OpenPOWER HostBoot Project */ +/* */ +/* Contributors Listed Below - COPYRIGHT 2014,2019 */ +/* [+] International Business Machines Corp. */ +/* */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or */ +/* implied. See the License for the specific language governing */ +/* permissions and limitations under the License. */ +/* */ +/* IBM_PROLOG_END_TAG */ + +#ifndef NVDIMM_ERROR_LOG_H__ +#define NVDIMM_ERROR_LOG_H__ + +#include +#include +#include +#include +#include +#include +#include "nvdimmdd.H" +#include "nvdimm.H" + +using namespace TARGETING; +using namespace EEPROM; + +// Trace definition +extern trace_desc_t* g_trac_nvdimm; + +namespace NVDIMM +{ + +/** + * @brief Function to read and save status registers for traces + * + * @param[in] i_nvdimm - nvdimm target with NV controller + * + * @param[out] o_RegInfo - the structure holding the register data + * + */ +void nvdimmTraceRegs(Target *i_nvdimm, nvdimm_reg_t& o_RegInfo); + +/** + * @brief Helper function for standard callout of an NVDIMM + * + * @param[in] i_nvdimm - nvdimm target + * @param[in] i_step - calling step, as passed to nvdimmHealthStatusCheck + * @param[in,out] o_err - error log handler to be modified + * + * @return bool - true to commit log and continue, false to return + * the error log to caller and exit.
+ */ +bool nvdimmCalloutDimm(Target *i_nvdimm, uint8_t i_step, errlHndl_t& o_err); + +/** + * @brief Helper function for BPM/Cable high, NVDIMM low callout + * + * @param[in] i_nvdimm - nvdimm target + * @param[in] i_step - calling step, as passed to nvdimmHealthStatusCheck + * @param[in,out] o_err - error log handler to be modified + * + * @return bool - true to commit log and continue, false to return + * the error log to caller and exit. + */ +bool nvdimmBPMCableCallout(Target *i_nvdimm, uint8_t i_step, errlHndl_t& o_err); + +/** + * @brief Helper function for BPM high, NVDIMM low callout + * + * @param[in] i_nvdimm - nvdimm target + * @param[in] i_step - calling step, as passed to nvdimmHealthStatusCheck + * @param[in,out] o_err - error log handler to be modified + * + * @return bool - true to commit log and continue, false to return + * the error log to caller and exit. + */ +bool nvdimmBPMCallout(Target *i_nvdimm, uint8_t i_step, errlHndl_t& o_err); + +/** + * @brief Function checking the Health Status Registers for an nvdimm + * + * @param[in] i_nvdimm - nvdimm target + * @param[in] i_step - calling step (e.g. HEALTH_SAVE, HEALTH_PRE_ARM, HEALTH_POST_ARM) + * @param[out] o_continue - true to continue, false if the caller should exit + * + * @return errlHndl_t - Null if successful, otherwise a pointer to + * the error log.
+ */ +errlHndl_t nvdimmHealthStatusCheck(Target *i_nvdimm, uint8_t i_step, bool& o_continue); + +} //End NVDIMM namespace + + +#endif // NVDIMM_ERROR_LOG_H__ diff --git a/src/usr/isteps/nvdimm/nvdimmdd.C b/src/usr/isteps/nvdimm/nvdimmdd.C index 695a60b9346..ce6fa65d833 100755 --- a/src/usr/isteps/nvdimm/nvdimmdd.C +++ b/src/usr/isteps/nvdimm/nvdimmdd.C @@ -79,6 +79,7 @@ TRAC_INIT( & g_trac_nvdimmr, "NVDIMMR", KILOBYTE ); #define MAX_READ_RETRY_SECS 30 // ---------------------------------------------- +using namespace TARGETING; namespace { diff --git a/src/usr/isteps/nvdimm/nvdimmdd.H b/src/usr/isteps/nvdimm/nvdimmdd.H index 1e299f2deac..37fa4a18854 100755 --- a/src/usr/isteps/nvdimm/nvdimmdd.H +++ b/src/usr/isteps/nvdimm/nvdimmdd.H @@ -92,6 +92,78 @@ struct nvdimm_addr_t } }; +/** + * @brief Structure of registers for error log traces + */ +struct nvdimm_reg_t +{ + uint8_t Module_Health; + uint8_t Module_Health_Status0; + uint8_t Module_Health_Status1; + uint8_t CSave_Status; + uint8_t CSave_Info; + uint8_t CSave_Fail_Info0; + uint8_t CSave_Fail_Info1; + uint8_t CSave_Timeout0; + uint8_t CSave_Timeout1; + uint8_t Error_Threshold_Status; + uint8_t NVDimm_Ready; + uint8_t NVDimm_CMD_Status0; + uint8_t Erase_Status; + uint8_t Erase_Timeout0; + uint8_t Erase_Timeout1; + uint8_t Abort_CMD_Timeout; + uint8_t Set_ES_Policy_Status; + uint8_t Restore_Status; + uint8_t Restore_Fail_Info; + uint8_t Restore_Timeout0; + uint8_t Restore_Timeout1; + uint8_t Arm_Status; + uint8_t Arm_Timeout0; + uint8_t Arm_Timeout1; + uint8_t Set_Event_Notification_Status; + + /** + * @brief Construct a default nvdimm_reg_t + */ + nvdimm_reg_t() + : Module_Health(0), + Module_Health_Status0(0), + Module_Health_Status1(0), + CSave_Status(0), + CSave_Info(0), + CSave_Fail_Info0(0), + CSave_Fail_Info1(0), + CSave_Timeout0(0), + CSave_Timeout1(0), + Error_Threshold_Status(0), + NVDimm_Ready(0), + NVDimm_CMD_Status0(0), + Erase_Status(0), + Erase_Timeout0(0), + Erase_Timeout1(0), + 
Abort_CMD_Timeout(0), + Set_ES_Policy_Status(0), + Restore_Status(0), + Restore_Fail_Info(0), + Restore_Timeout0(0), + Restore_Timeout1(0), + Arm_Status(0), + Arm_Timeout0(0), + Arm_Timeout1(0), + Set_Event_Notification_Status(0) + + { + } + + /** + * @brief Default destructor of nvdimm_reg_t + */ + ~nvdimm_reg_t() = default; + +}; + + /* * @brief Miscellaneous enums for NVDIMM */ diff --git a/src/usr/isteps/nvdimm/plugins/errludP_nvdimm.H b/src/usr/isteps/nvdimm/plugins/errludP_nvdimm.H index 460add6f3b4..2c7f1d2a02d 100644 --- a/src/usr/isteps/nvdimm/plugins/errludP_nvdimm.H +++ b/src/usr/isteps/nvdimm/plugins/errludP_nvdimm.H @@ -164,6 +164,119 @@ private: UdParserNvdimmParms & operator=(const UdParserNvdimmParms&); }; +/** + * @class UdParserNvdimmOPParms + * + * Parses UdNvdimmOPParms + */ +class UdParserNvdimmOPParms : public ERRORLOG::ErrlUserDetailsParser +{ +public: + /** + * @brief Constructor + */ + UdParserNvdimmOPParms() {} + + /** + * @brief Destructor + */ + virtual ~UdParserNvdimmOPParms() = default; + + /** + * @brief Parses string user detail data from an error log + * + * @param i_version Version of the data + * @param i_parser ErrlUsrParser object for outputting information + * @param i_pBuffer Pointer to buffer containing detail data + * @param i_buflen Length of the buffer + */ + virtual void parse(errlver_t i_version, + ErrlUsrParser & i_parser, + void * i_pBuffer, + const uint32_t i_buflen) const + { + char* l_databuf = static_cast(i_pBuffer); + i_parser.PrintHeading("NVDIMM I2C Register Traces"); + + //***** Memory Layout ***** + // 1 byte : MODULE_HEALTH + // 1 byte : MODULE_HEALTH_STATUS0 + // 1 byte : MODULE_HEALTH_STATUS1 + // 1 byte : CSAVE_STATUS + // 1 byte : CSAVE_INFO + // 1 byte : CSAVE_FAIL_INFO0 + // 1 byte : CSAVE_FAIL_INFO1 + // 1 byte : ERROR_THRESHOLD_STATUS + // 1 byte : NVDIMM_READY + // 1 byte : NVDIMM_CMD_STATUS0 + // 1 byte : ERASE_STATUS + // 1 byte : ERASE_TIMEOUT0 + // 1 byte : ERASE_TIMEOUT1 + // 1 byte :
ABORT_CMD_TIMEOUT + // 1 byte : SET_ES_POLICY_STATUS + // 1 byte : RESTORE_STATUS + // 1 byte : RESTORE_FAIL_INFO + // 1 byte : RESTORE_TIMEOUT0 + // 1 byte : RESTORE_TIMEOUT1 + // 1 byte : ARM_STATUS + // 1 byte : ARM_TIMEOUT0 + // 1 byte : ARM_TIMEOUT1 + // 1 byte : SET_EVENT_NOTIFICATION_STATUS + // NOTE(review): nvdimm_reg_t also has CSave_Timeout0/1, absent here; confirm + + i_parser.PrintNumber("Module Health Register: ","%.2lX",TO_UINT8(l_databuf)); + ++l_databuf; + i_parser.PrintNumber("Module Health Status0 Register: ","%.2lX",TO_UINT8(l_databuf)); + ++l_databuf; + i_parser.PrintNumber("Module Health Status1 Register: ","%.2lX",TO_UINT8(l_databuf)); + ++l_databuf; + i_parser.PrintNumber("CSave Status Register: ","%.2lX",TO_UINT8(l_databuf)); + ++l_databuf; + i_parser.PrintNumber("CSave Info Register: ","%.2lX",TO_UINT8(l_databuf)); + ++l_databuf; + i_parser.PrintNumber("CSave Fail Info0 Register: ","%.2lX",TO_UINT8(l_databuf)); + ++l_databuf; + i_parser.PrintNumber("CSave Fail Info1 Register: ","%.2lX",TO_UINT8(l_databuf)); + ++l_databuf; + i_parser.PrintNumber("Error Threshold Status Register: ","%.2lX",TO_UINT8(l_databuf)); + ++l_databuf; + i_parser.PrintNumber("NVDIMM Ready Register: ","%.2lX",TO_UINT8(l_databuf)); + ++l_databuf; + i_parser.PrintNumber("NVDIMM CMD Status0 Register: ","%.2lX",TO_UINT8(l_databuf)); + ++l_databuf; + i_parser.PrintNumber("Erase Status Register: ","%.2lX",TO_UINT8(l_databuf)); + ++l_databuf; + i_parser.PrintNumber("Erase Timeout0 Register: ","%.2lX",TO_UINT8(l_databuf)); + ++l_databuf; + i_parser.PrintNumber("Erase Timeout1 Register: ","%.2lX",TO_UINT8(l_databuf)); + ++l_databuf; + i_parser.PrintNumber("Abort CMD Timeout Register: ","%.2lX",TO_UINT8(l_databuf)); + ++l_databuf; + i_parser.PrintNumber("Set ES Policy Status Register: ","%.2lX",TO_UINT8(l_databuf)); + ++l_databuf; + i_parser.PrintNumber("Restore Status Register: ","%.2lX",TO_UINT8(l_databuf)); + ++l_databuf; + i_parser.PrintNumber("Restore Fail Info0 Register: ","%.2lX",TO_UINT8(l_databuf)); + ++l_databuf; +
i_parser.PrintNumber("Restore Timeout0 Register: ","%.2lX",TO_UINT8(l_databuf)); + ++l_databuf; + i_parser.PrintNumber("Restore Timeout1 Register: ","%.2lX",TO_UINT8(l_databuf)); + ++l_databuf; + i_parser.PrintNumber("Arm Status Register: ","%.2lX",TO_UINT8(l_databuf)); + ++l_databuf; + i_parser.PrintNumber("Arm Timeout0 Register: ","%.2lX",TO_UINT8(l_databuf)); + ++l_databuf; + i_parser.PrintNumber("Arm Timeout1 Register: ","%.2lX",TO_UINT8(l_databuf)); + ++l_databuf; + i_parser.PrintNumber("Set Event Notification Status Register: ","%.2lX",TO_UINT8(l_databuf)); + ++l_databuf; + } + + // Disabled + UdParserNvdimmOPParms(const UdParserNvdimmOPParms&) = delete; + UdParserNvdimmOPParms & operator=(UdParserNvdimmOPParms &) = delete; +}; + } // end NVDIMM namespace #endif diff --git a/src/usr/isteps/nvdimm/plugins/nvdimmUdParserFactory.H b/src/usr/isteps/nvdimm/plugins/nvdimmUdParserFactory.H index b27774b13e2..f208ac06026 100644 --- a/src/usr/isteps/nvdimm/plugins/nvdimmUdParserFactory.H +++ b/src/usr/isteps/nvdimm/plugins/nvdimmUdParserFactory.H @@ -38,14 +38,14 @@ namespace NVDIMM { registerParser (NVDIMM_UDT_PARAMETERS); + registerParser + (NVDIMM_OP_PARAMETERS); } - private: - - UserDetailsParserFactory(const UserDetailsParserFactory &); - UserDetailsParserFactory & operator= - (const UserDetailsParserFactory &); + UserDetailsParserFactory(const UserDetailsParserFactory &) = delete; + UserDetailsParserFactory & operator=(UserDetailsParserFactory &) = delete; }; + }; #endif diff --git a/src/usr/isteps/nvdimm/runtime/nvdimm_rt.C b/src/usr/isteps/nvdimm/runtime/nvdimm_rt.C index d3a9d41a4eb..d5432712c90 100644 --- a/src/usr/isteps/nvdimm/runtime/nvdimm_rt.C +++ b/src/usr/isteps/nvdimm/runtime/nvdimm_rt.C @@ -41,7 +41,10 @@ #include #include #include +#include #include +#include "../errlud_nvdimm.H" +#include "../nvdimmErrorLog.H" #include // implements some of these #include "../nvdimm.H" // for g_trac_nvdimm @@ -88,11 +91,12 @@ errlHndl_t nvdimmPollArmDone(Target* 
i_nvdimm, * the trigger has been armed to ddr_reset_n * * @param[in] i_nvdimm - nvdimm target with NV controller + * @param[in] i_arm_timeout - nvdimm local timeout status * * @return errlHndl_t - Null if successful, otherwise a pointer to * the error log. */ -errlHndl_t nvdimmCheckArmSuccess(Target *i_nvdimm) +errlHndl_t nvdimmCheckArmSuccess(Target *i_nvdimm, bool i_arm_timeout) { TRACUCOMP(g_trac_nvdimm, ENTER_MRK"nvdimmCheckArmSuccess() nvdimm[%X]", get_huid(i_nvdimm)); @@ -107,7 +111,7 @@ errlHndl_t nvdimmCheckArmSuccess(Target *i_nvdimm) TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmCheckArmSuccess() nvdimm[%X]" "failed to read arm status reg!",get_huid(i_nvdimm)); } - else if ((l_data & ARM_SUCCESS) != ARM_SUCCESS) + else if (((l_data & ARM_ERROR) == ARM_ERROR) || ((l_data & RESET_N_ARMED) != RESET_N_ARMED) || i_arm_timeout) { TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmCheckArmSuccess() nvdimm[%X]" @@ -140,13 +144,9 @@ errlHndl_t nvdimmCheckArmSuccess(Target *i_nvdimm) // if failed to arm trigger l_err->addPartCallout( i_nvdimm, HWAS::NV_CONTROLLER_PART_TYPE, - HWAS::SRCI_PRIORITY_HIGH); - l_err->addPartCallout( i_nvdimm, - HWAS::BPM_PART_TYPE, - HWAS::SRCI_PRIORITY_MED); - l_err->addPartCallout( i_nvdimm, - HWAS::BPM_CABLE_PART_TYPE, - HWAS::SRCI_PRIORITY_MED); + HWAS::SRCI_PRIORITY_HIGH, + HWAS::DECONFIG, + HWAS::GARD_Fatal); } TRACUCOMP(g_trac_nvdimm, EXIT_MRK"nvdimmCheckArmSuccess() nvdimm[%X] ret[%X]", @@ -158,14 +158,21 @@ errlHndl_t nvdimmCheckArmSuccess(Target *i_nvdimm) bool nvdimmArm(TargetHandleList &i_nvdimmTargetList) { bool o_arm_successful = true; + bool l_continue = true; + bool l_arm_timeout = false; + uint8_t l_data; + auto l_RegInfo = nvdimm_reg_t(); TRACFCOMP(g_trac_nvdimm, ENTER_MRK"nvdimmArm() %d", i_nvdimmTargetList.size()); errlHndl_t l_err = nullptr; + errlHndl_t l_err_t = nullptr; for (auto const l_nvdimm : i_nvdimmTargetList) { + l_arm_timeout = false; + // skip if the nvdimm is already armed ATTR_NVDIMM_ARMED_type l_armed_state = {}; 
l_armed_state = l_nvdimm->getAttr(); @@ -175,27 +182,35 @@ bool nvdimmArm(TargetHandleList &i_nvdimmTargetList) continue; } - // skip if the nvdimm is in error state - if (NVDIMM::nvdimmInErrorState(l_nvdimm)) - { - // error state means arming not successful - o_arm_successful = false; - continue; - } - + // Set ES Policy, contains all of its status checks l_err = nvdimmSetESPolicy(l_nvdimm); if (l_err) { + TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] failed to set ES Policy", get_huid(l_nvdimm)); o_arm_successful = false; - nvdimmSetStatusFlag(l_nvdimm, NSTD_ERR_NOBKUP); + + nvdimmDisarm(i_nvdimmTargetList); + l_err_t = notifyNvdimmProtectionChange(l_nvdimm, NVDIMM_DISARMED); + if (l_err_t) + { + errlCommit( l_err_t, NVDIMM_COMP_ID ); + } // Committing the error as we don't want this to interrupt // the boot. This will notify the user that action is needed // on this module l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE); l_err->collectTrace(NVDIMM_COMP_NAME); + + // Callout nvdimm at high priority: gard (GARD_Fatal) but NO_DECONFIG + l_err->addPartCallout( l_nvdimm, + HWAS::NV_CONTROLLER_PART_TYPE, + HWAS::SRCI_PRIORITY_HIGH, + HWAS::NO_DECONFIG, + HWAS::GARD_Fatal); + errlCommit( l_err, NVDIMM_COMP_ID ); - continue; + break; } l_err = NVDIMM::nvdimmChangeArmState(l_nvdimm, ARM_TRIGGER); @@ -205,7 +220,14 @@ bool nvdimmArm(TargetHandleList &i_nvdimmTargetList) // salvage the data if (l_err) { - NVDIMM::nvdimmSetStatusFlag(l_nvdimm, NVDIMM::NSTD_ERR_NOBKUP); + TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] failed to trigger arm", get_huid(l_nvdimm)); + + l_err_t = notifyNvdimmProtectionChange(l_nvdimm, NVDIMM_DISARMED); + if (l_err_t) + { + errlCommit( l_err_t, NVDIMM_COMP_ID ); + } + // Committing the error as we don't want this to interrupt // the boot.
This will notify the user that action is needed // on this module @@ -221,29 +243,78 @@ bool nvdimmArm(TargetHandleList &i_nvdimmTargetList) l_err = nvdimmPollArmDone(l_nvdimm, l_poll); if (l_err) { - NVDIMM::nvdimmSetStatusFlag(l_nvdimm, NVDIMM::NSTD_ERR_NOBKUP); + TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] arm command timed out", get_huid(l_nvdimm)); + l_arm_timeout = true; + + l_err_t = notifyNvdimmProtectionChange(l_nvdimm, NVDIMM_DISARMED); + if (l_err_t) + { + errlCommit( l_err_t, NVDIMM_COMP_ID ); + } + // Committing the error as we don't want this to interrupt // the boot. This will notify the user that action is needed // on this module l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE); l_err->collectTrace(NVDIMM_COMP_NAME); + errlCommit( l_err, NVDIMM_COMP_ID ); o_arm_successful = false; - continue; } - l_err = nvdimmCheckArmSuccess(l_nvdimm); + // Check health status registers and exit if required + l_err = nvdimmHealthStatusCheck( l_nvdimm, HEALTH_PRE_ARM, l_arm_timeout ); + // NOTE(review): o_continue is bound to l_arm_timeout in the call above, + // so l_continue tested below is not updated by this call; confirm. + if (l_err) + { + TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] failed first health status check", get_huid(l_nvdimm)); + if (!l_continue) + { + errlCommit( l_err, NVDIMM_COMP_ID ); + + // Disarming all dimms due to error + nvdimmDisarm(i_nvdimmTargetList); + + o_arm_successful = false; + break; + } + else + { + errlCommit( l_err, NVDIMM_COMP_ID ); + continue; + } + } + + l_err = nvdimmCheckArmSuccess(l_nvdimm, l_arm_timeout); if (l_err) { - NVDIMM::nvdimmSetStatusFlag(l_nvdimm, NVDIMM::NSTD_ERR_NOBKUP); + TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] failed to succesfully arm", get_huid(l_nvdimm)); + + // Disarming all dimms due to error + nvdimmDisarm(i_nvdimmTargetList); + + l_err_t = notifyNvdimmProtectionChange(l_nvdimm, NVDIMM_DISARMED); + if (l_err_t) + { + errlCommit( l_err_t, NVDIMM_COMP_ID ); + } + // Committing the error as we don't want this to interrupt // the boot.
This will notify the user that action is needed // on this module l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE); l_err->collectTrace(NVDIMM_COMP_NAME); - errlCommit( l_err, NVDIMM_COMP_ID ); + + // Dump Traces for error logs + nvdimmTraceRegs( l_nvdimm, l_RegInfo ); + + // Add reg traces to the error log + NVDIMM::UdNvdimmOPParms( l_RegInfo ).addToLog(l_err); + + errlCommit(l_err, NVDIMM_COMP_ID); o_arm_successful = false; - continue; + break; } // After arming the trigger, erase the image to prevent the possible @@ -252,7 +323,17 @@ bool nvdimmArm(TargetHandleList &i_nvdimmTargetList) l_err = nvdimmEraseNF(l_nvdimm); if (l_err) { - NVDIMM::nvdimmSetStatusFlag(l_nvdimm, NVDIMM::NSTD_ERR_NOBKUP); + TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] failed to erase post arm", get_huid(l_nvdimm)); + + // Disarming all dimms due to error + nvdimmDisarm(i_nvdimmTargetList); + + l_err_t = notifyNvdimmProtectionChange(l_nvdimm, NVDIMM_DISARMED); + if (l_err_t) + { + errlCommit( l_err_t, NVDIMM_COMP_ID ); + } + // Committing the error as we don't want this to interrupt // the boot. 
This will notify the user that action is needed // on this module @@ -271,8 +352,7 @@ bool nvdimmArm(TargetHandleList &i_nvdimmTargetList) l_err->collectTrace(NVDIMM_COMP_NAME); errlCommit(l_err, NVDIMM_COMP_ID); } - - continue; + break; } // Arm successful, update armed status @@ -284,6 +364,78 @@ bool nvdimmArm(TargetHandleList &i_nvdimmTargetList) l_err->collectTrace(NVDIMM_COMP_NAME); errlCommit(l_err, NVDIMM_COMP_ID); } + + // Enable event notification + l_err = nvdimmWriteReg(l_nvdimm, SET_EVENT_NOTIFICATION_CMD, PERSISTENCY_NOTIFICATION); + if (l_err) + { + TRACFCOMP(g_trac_nvdimm, ERR_MRK"NVDIMM HUID[%X] error setting event notification!!", + TARGETING::get_huid(l_nvdimm)); + errlCommit(l_err, NVDIMM_COMP_ID); + } + + // Check notification status and errors + l_err = nvdimmReadReg(l_nvdimm, SET_EVENT_NOTIFICATION_STATUS, l_data); + if (l_err) + { + errlCommit( l_err, NVDIMM_COMP_ID ); + } + else if (((l_data & SET_EVENT_NOTIFICATION_ERROR) == SET_EVENT_NOTIFICATION_ERROR) || ((l_data & PERSISTENCY_ENABLED) != PERSISTENCY_ENABLED)) + { + TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] failed to set event notification", get_huid(l_nvdimm)); + + // Set NVDIMM Status flag to Restored, as error detected but data might persist + nvdimmSetStatusFlag(l_nvdimm, NSTD_VAL_ERROR); + + /*@ + *@errortype + *@reasoncode NVDIMM_SET_EVENT_NOTIFICATION_ERROR + *@severity ERRORLOG_SEV_PREDICTIVE + *@moduleid NVDIMM_SET_EVENT_NOTIFICATION + *@userdata1[0:31] Target Huid + *@userdata2 + *@devdesc NVDIMM threw an error or failed to set event + * notifications during arming + *@custdesc NVDIMM failed to enable event notifications + */ + l_err = new ERRORLOG::ErrlEntry( ERRORLOG::ERRL_SEV_PREDICTIVE, + NVDIMM_SET_EVENT_NOTIFICATION, + NVDIMM_SET_EVENT_NOTIFICATION_ERROR, + TARGETING::get_huid(l_nvdimm), + 0x0, + ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); + + l_err->collectTrace( NVDIMM_COMP_NAME ); + + // Callout dimm but no deconfig and gard + l_err->addPartCallout( l_nvdimm, +
HWAS::NV_CONTROLLER_PART_TYPE, + HWAS::SRCI_PRIORITY_LOW); + + + // Read relevant regs for trace data + nvdimmTraceRegs(l_nvdimm, l_RegInfo); + + // Add reg traces to the error log + NVDIMM::UdNvdimmOPParms( l_RegInfo ).addToLog(l_err); + + errlCommit( l_err, NVDIMM_COMP_ID ); + break; + } + + // Re-check health status registers + l_err = nvdimmHealthStatusCheck( l_nvdimm, HEALTH_POST_ARM, l_continue ); + + // Check for health status failure + if (l_err) + { + TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] failed final health status check", get_huid(l_nvdimm)); + + errlCommit( l_err, NVDIMM_COMP_ID ); + o_arm_successful = false; + break; + } + } TRACFCOMP(g_trac_nvdimm, EXIT_MRK"nvdimmArm() returning %d", @@ -318,7 +470,6 @@ bool nvdimmDisarm(TargetHandleList &i_nvdimmTargetList) // salvage the data if (l_err) { - NVDIMM::nvdimmSetStatusFlag(l_nvdimm, NVDIMM::NSTD_ERR_NOBKUP); // Committing the error as we don't want this to interrupt // the boot. This will notify the user that action is needed // on this module @@ -361,8 +512,7 @@ bool nvdimmInErrorState(Target *i_nvdimm) // Just checking bit 1 for now, need to investigate these // Should be checking NVDIMM_ARMED instead - //if ((l_statusFlag & NSTD_ERR) == 0) - if ((l_statusFlag & NSTD_ERR_NOPRSV) == 0) + if ((l_statusFlag & NSTD_VAL_ERASED) == 0) { l_ret = false; }