Force reboot without visible errors for core wakeup failure

The intermittent core wakeup failure continues to plague us with no solution in sight. Since the error is extremely rare (less than 1% of boots) we have decided to force a manual reboot and not log any visible errors to the customer. Change-Id: Ic30f6330431bd2c8ce75075befc2c36d278d8152 Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/71319 Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com> Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com> Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com> Reviewed-by: Daniel M. Crowell <dcrowell@us.ibm.com>
open-power · Feb 7, 2019 · 39cba20 · 39cba20
1 parent 8a977a1
commit 39cba20
Showing 1 changed file with 34 additions and 9 deletions.
diff --git a/src/usr/isteps/istep16/call_host_activate_slave_cores.C b/src/usr/isteps/istep16/call_host_activate_slave_cores.C
@@ -51,6 +51,7 @@
 #include    <scom/scomif.H>
 #include    <errl/errludprintk.H>
 #include    <intr/intr_reasoncodes.H>
+#include    <initservice/istepdispatcherif.H>
 
 using   namespace   ERRORLOG;
 using   namespace   TARGETING;
@@ -84,6 +85,9 @@ void* call_host_activate_slave_cores (void *io_pArgs)
     assert( sys != NULL );
     uint32_t l_numCores = 0;
 
+    // keep track of which cores started
+    TargetHandleList l_startedCores;
+
     for(TargetHandleList::const_iterator
         l_core = l_cores.begin();
         l_core != l_cores.end();
@@ -233,7 +237,8 @@ void* call_host_activate_slave_cores (void *io_pArgs)
                 // Add interesting ISTEP traces
                 l_errl->collectTrace(ISTEP_COMP_NAME,256);
 
-                l_stepError.addErrorDetails( l_errl );
+                // Choosing to ignore this intermittent error
+                l_errl->setSev(ERRORLOG::ERRL_SEV_INFORMATIONAL);
                 errlCommit( l_errl, HWPF_COMP_ID );
 
                 // Remember that we failed so we can gard the core if it
@@ -242,6 +247,14 @@ void* call_host_activate_slave_cores (void *io_pArgs)
                 (*l_core)->
                   setAttr<TARGETING::ATTR_PREVIOUS_WAKEUP_FAIL>(l_prevFail);
 
+#ifdef CONFIG_BMC_IPMI
+                // Initiate a graceful power cycle
+                CONSOLE::displayf(ISTEP_COMP_NAME, "System Rebooting To Retry Recoverable Error");
+                CONSOLE::flush();
+                TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace,"call_host_activate_slave_cores: requesting power cycle");
+                INITSERVICE::requestReboot();
+#endif
+
                 break;
             }
             // Create unrecoverable error log if this is a repeat
@@ -306,17 +319,29 @@ void* call_host_activate_slave_cores (void *io_pArgs)
             // Zero out the counter if we passed 
             else if( l_prevFail > 0 )
             {
-                TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace,
-                           "call_host_activate_slave_cores: "
-                           "Resetting failure count for core %.8X",
-                           TARGETING::get_huid(*l_core) );
-                l_prevFail = 0;
-                (*l_core)->
-                  setAttr<TARGETING::ATTR_PREVIOUS_WAKEUP_FAIL>(l_prevFail);
+                // Add to the list of passing cores so we can
+                //  clear ATTR_PREVIOUS_WAKEUP_FAIL later
+                l_startedCores.push_back(*l_core);
             }
         }
     }
-    // @@@@@    END CUSTOM BLOCK:   @@@@@
+
+    // Clear out the wakeup_fail indicators only after every core has passed.
+    //  Doing this outside the loop helps mitigate the (unlikely) case where
+    //  a failure bounces between different cores on several consecutive boots.
+    for(TargetHandleList::const_iterator
+        l_core = l_startedCores.begin();
+        l_core != l_startedCores.end();
+        ++l_core)
+    {
+        TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace,
+                   "call_host_activate_slave_cores: "
+                   "Resetting failure count for core %.8X",
+                   TARGETING::get_huid(*l_core) );
+        ATTR_PREVIOUS_WAKEUP_FAIL_type l_prevFail = 0;
+        (*l_core)->
+          setAttr<TARGETING::ATTR_PREVIOUS_WAKEUP_FAIL>(l_prevFail);
+    }
 
 #if defined(CONFIG_IPLTIME_CHECKSTOP_ANALYSIS) && !defined(__HOSTBOOT_RUNTIME)
     if( l_stepError.isNull() )