Skip to content

Commit

Permalink
Clear ECC sections marked "clearOnEccErr" on error
Browse files Browse the repository at this point in the history
  - Add the capability for Hostboot to recover (with reboot)
    when it consumes an ECC error
  - PNOR layout needs to be updated to flag the recoverable
    sections (generally cached or throwaway data such as the *VPD,
    HBEL, and GUARD partitions)
  - Upon bad ECC detection, Hostboot will check the partition
    flag and, if it is set, clear the section and write good ECC
    to PNOR. It will then throw the normal error and terminate,
    waiting for the BMC to issue a reboot

Change-Id: Ie4f4c0637d3962e9d4871e84a0bda8c256a74440
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/44608
Reviewed-by: Stephen M. Cprek <smcprek@us.ibm.com>
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
Reviewed-by: Nicholas E. Bofferding <bofferdn@us.ibm.com>
Reviewed-by: Daniel M. Crowell <dcrowell@us.ibm.com>
  • Loading branch information
sannerd authored and dcrowell77 committed Sep 15, 2017
1 parent 16887e0 commit 9acfce9
Show file tree
Hide file tree
Showing 8 changed files with 108 additions and 3 deletions.
2 changes: 2 additions & 0 deletions src/build/buildpnor/PnorUtils.pm
Expand Up @@ -156,6 +156,7 @@ sub loadPnorLayout
my $sha512perEC = (exists $sectionEl->{sha512perEC} ? "yes" : "no");
my $preserved = (exists $sectionEl->{preserved} ? "yes" : "no");
my $reprovision = (exists $sectionEl->{reprovision} ? "yes" : "no");
my $clearOnEccErr = (exists $sectionEl->{clearOnEccErr} ? "yes" : "no");
my $readOnly = (exists $sectionEl->{readOnly} ? "yes" : "no");
if (($i_testRun == 0) && ($sectionEl->{testonly}[0] eq "yes"))
{
Expand All @@ -182,6 +183,7 @@ sub loadPnorLayout
$$i_pnorLayoutRef{sections}{$physicalOffset}{sha512perEC} = $sha512perEC;
$$i_pnorLayoutRef{sections}{$physicalOffset}{preserved} = $preserved;
$$i_pnorLayoutRef{sections}{$physicalOffset}{reprovision} = $reprovision;
$$i_pnorLayoutRef{sections}{$physicalOffset}{clearOnEccErr} = $clearOnEccErr;
$$i_pnorLayoutRef{sections}{$physicalOffset}{readOnly} = $readOnly;

#store the physical offsets of each section in a hash, so, it is easy
Expand Down
4 changes: 4 additions & 0 deletions src/build/buildpnor/buildpnor.pl
Expand Up @@ -245,6 +245,10 @@ sub addUserData
{
$miscFlags |= 0x08;
}
if( ($i_sectionHash{$i_key}{clearOnEccErr} eq "yes") )
{
$miscFlags |= 0x04;
}

#First User Data Word
#[1:chip][1:compressType][2:dataInteg]
Expand Down
10 changes: 10 additions & 0 deletions src/include/usr/errl/errlmanager.H
Expand Up @@ -335,6 +335,16 @@ private:
*/
static void * startup ( void* i_self );

/**
 * @brief Performs setup of the PNOR info. Done as a separate
 *        thread to handle error conditions (ECC on HBEL) where
 *        task gets killed.
 *
 * @param[in] i_self  Pointer to the ErrlManager instance whose PNOR
 *                    info is to be set up (passed as void* so this
 *                    function can serve as a task entry point)
 * @return Always nullptr
 */
static void * pnorSetupThread ( void* i_self );

/**
* @brief Message handler for process Hostboot error log message
* and send it to FSP.
Expand Down
1 change: 1 addition & 0 deletions src/include/usr/pnor/pnor_const.H
Expand Up @@ -107,6 +107,7 @@ struct SectionInfo_t
bool reprovision; /**< Erase this section during a reprovision */
bool Volatile; /**< Section loses contents on non HB reboots */
bool secure; /**< Indicates if a section is secure */
bool clearOnEccErr; /**< Indicates on ECC errors, clear and reboot*/
#ifdef CONFIG_SECUREBOOT
size_t secureProtectedPayloadSize; /**< Cache the secure payload size so
that the secure container only
Expand Down
34 changes: 33 additions & 1 deletion src/usr/errl/errlmanager.C
Expand Up @@ -231,6 +231,20 @@ void * ErrlManager::startup ( void* i_self )
return NULL;
}

///////////////////////////////////////////////////////////////////////////////
// ErrlManager::pnorSetupThread()
///////////////////////////////////////////////////////////////////////////////
void * ErrlManager::pnorSetupThread ( void* i_self )
{
TRACFCOMP( g_trac_errl, ENTER_MRK "ErrlManager::pnorSetupThread..." );

//Start a thread to deal with PNOR setup
reinterpret_cast<ErrlManager *>(i_self)->setupPnorInfo();

TRACFCOMP( g_trac_errl, EXIT_MRK "ErrlManager::pnorSetupThread" );
return nullptr;
}


///////////////////////////////////////////////////////////////////////////////
// ErrlManager::errlogMsgHndlr()
Expand All @@ -250,7 +264,25 @@ void ErrlManager::errlogMsgHndlr ()
case ERRLOG_ACCESS_PNOR_TYPE:
{
// PNOR is up and running now.
setupPnorInfo();
// This can fail if there is bad ECC in HBEL (which is
// somewhat common on power faults). Because of this,
// trigger this as separate task so message that kills
//task on bad ECC doesn't bring down the whole daemon
auto l_tid = task_create(ErrlManager::pnorSetupThread,
this);

// status of the task ( OK or Crashed )
int l_childsts = 0;
auto l_tidretrc = task_wait_tid( l_tid, &l_childsts, 0);

if ((static_cast<int16_t>(l_tidretrc) < 0 ) ||
(l_childsts != TASK_STATUS_EXITED_CLEAN ))
{
TRACFCOMP(g_trac_errl, ERR_MRK "Failed to setup PNOR; l_tidretrc=0x%x,"
" l_childsts=0x%x", l_tidretrc, l_childsts);
//Set iv_pnorAddr to nullptr to prevent writes
iv_pnorAddr = nullptr;
}

//We are done with the msg
msg_free(theMsg);
Expand Down
3 changes: 2 additions & 1 deletion src/usr/pnor/common/ffs_hb.H
Expand Up @@ -69,8 +69,9 @@ enum
FFS_MISC_PSEUDO = 0x20, /**< Pseudo partition */
FFS_MISC_REPROVISION = 0x10, /**< Erased during reprovision */
FFS_MISC_VOLATILE = 0x08, /**< Loses contents based on boot types*/
FFS_MISC_CLR_ECC_ERR = 0x04, /**< Clear and reboot on ECC err */
FFS_MISC_GOLDEN = 0x01, /**< Golden side of PNOR */
FFS_MISC_UNUSED = 0x06, /**< Unused MISC Flags */
FFS_MISC_UNUSED = 0x02, /**< Unused MISC Flags */
};

/**
Expand Down
45 changes: 45 additions & 0 deletions src/usr/pnor/pnorrp.C
Expand Up @@ -649,6 +649,8 @@ errlHndl_t PnorRP::getSectionInfo( PNOR::SectionId i_section,
!= 0) ? true : false;
o_info.Volatile = ((iv_TOC[id].misc & FFS_MISC_VOLATILE)
!= 0) ? true : false;
o_info.clearOnEccErr = ((iv_TOC[id].misc & FFS_MISC_CLR_ECC_ERR)
!= 0) ? true : false;
}

} while(0);
Expand Down Expand Up @@ -1360,9 +1362,22 @@ errlHndl_t PnorRP::readFromDevice( uint64_t i_offset,
// create an error if we couldn't correct things
if( ecc_stat == PNOR::ECC::UNCORRECTABLE )
{
PNOR::SectionId l_id = computeSectionPhys(i_offset);
TRACFCOMP( g_trac_pnor, "PnorRP::readFromDevice> Uncorrectable ECC error : chip=%d,offset=0x%.X", i_chip, i_offset );
CONSOLE::displayf( NULL, "ECC error in PNOR flash in section offset 0x%.8X\n", i_offset );

//Attempt to find the section and check if we can clear
//it to recover
if ((l_id != PNOR::INVALID_SECTION )
&& ((iv_TOC[l_id].misc & FFS_MISC_CLR_ECC_ERR) != 0))
{
CONSOLE::displayf( nullptr, "Clearing section %s due to ECC error\n",
SectionIdToString(l_id));
clearSection(l_id); //shutting down -- ignore and leak errl

CONSOLE::displayf( nullptr, "Done\n");
}

// Need to shutdown here instead of creating an error log
// because the bad page could be critical to the regular
// error handling path and cause an infinite loop.
Expand Down Expand Up @@ -1589,6 +1604,36 @@ errlHndl_t PnorRP::computeSection( uint64_t i_vaddr,
return errhdl;
}

/**
 * @brief Figure out which section a physical PNOR offset belongs to
 *
 * Walks the table of contents (iv_TOC) in section-id order and returns the
 * id recorded in the first entry whose flash range contains i_offset.  For
 * entries flagged FFS_INTEG_ECC_PROTECT the TOC size is scaled by 9/8 before
 * the range check so that the ECC bytes are taken into account.
 *
 * @param[in] i_offset  Physical offset into PNOR
 *
 * @return Id of the matching section, or PNOR::INVALID_SECTION when no
 *         TOC entry covers the offset
 */
PNOR::SectionId PnorRP::computeSectionPhys( uint64_t i_offset)
{
    for( PNOR::SectionId l_sec = PNOR::FIRST_SECTION;
         l_sec < PNOR::NUM_SECTIONS;
         l_sec = static_cast<PNOR::SectionId>(l_sec + 1) )
    {
        // Size of the section as it is laid out in flash; grows by one
        // byte for every eight when the section carries ECC.
        // NOTE(review): presumably the TOC size excludes ECC bytes -- confirm
        uint32_t l_flashSpan = iv_TOC[l_sec].size;
        const bool l_eccProtected =
            ((iv_TOC[l_sec].integrity & FFS_INTEG_ECC_PROTECT) != 0);
        if( l_eccProtected )
        {
            l_flashSpan = (l_flashSpan / 8) * 9;
        }

        // Offset lies before the start of this section
        if( i_offset < iv_TOC[l_sec].flashAddr )
        {
            continue;
        }

        // Offset lies at or beyond the end of this section
        if( i_offset >= (iv_TOC[l_sec].flashAddr + l_flashSpan) )
        {
            continue;
        }

        // First section whose range contains the offset wins
        return iv_TOC[l_sec].id;
    }

    // Offset is not mapped by any section
    return PNOR::INVALID_SECTION;
}

errlHndl_t PnorRP::clearSection(PNOR::SectionId i_section)
{
TRACDCOMP(g_trac_pnor, "PnorRP::clearSection Section id = %d", i_section);
Expand Down
12 changes: 11 additions & 1 deletion src/usr/pnor/pnorrp.H
Expand Up @@ -5,7 +5,7 @@
/* */
/* OpenPOWER HostBoot Project */
/* */
/* Contributors Listed Below - COPYRIGHT 2011,2016 */
/* Contributors Listed Below - COPYRIGHT 2011,2017 */
/* [+] Google Inc. */
/* [+] International Business Machines Corp. */
/* */
Expand Down Expand Up @@ -279,6 +279,16 @@ class PnorRP
errlHndl_t computeSection( uint64_t i_vaddr,
PNOR::SectionId& o_id );

/**
 * @brief Figure out which section a physical address (PA) belongs to
 *
 * @param[in] i_offset Physical offset into PNOR
 *
 * @return Id of the PNOR section containing i_offset, or
 *         PNOR::INVALID_SECTION if the offset is not mapped
 *         by any section
 */
PNOR::SectionId computeSectionPhys( uint64_t i_offset );

/**
* @brief Returns true if the initial startup failed for some reason
* @param[out] Return code
Expand Down

0 comments on commit 9acfce9

Please sign in to comment.