open-mpi · bwbarrett · Jul 11, 2017 · Jun 27, 2017 · bosilca · Jun 27, 2017
diff --git a/orte/mca/errmgr/default_hnp/errmgr_default_hnp.c b/orte/mca/errmgr/default_hnp/errmgr_default_hnp.c
@@ -443,27 +443,32 @@ static void proc_errors(int fd, short args, void *cbdata)
                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)));
         /* record the first one to fail */
         if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
-            /* output an error message so the user knows what happened */
-            orte_show_help("help-errmgr-base.txt", "node-died", true,
-                           ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
-                           orte_process_info.nodename,
-                           ORTE_NAME_PRINT(proc),
-                           pptr->node->name);
             /* mark the daemon job as failed */
             jdata->state = ORTE_JOB_STATE_COMM_FAILED;
             /* point to the lowest rank to cause the problem */
             orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
             /* retain the object so it doesn't get free'd */
             OBJ_RETAIN(pptr);
             ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
-            /* update our exit code */
-            ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
-            /* just in case the exit code hadn't been set, do it here - this
-             * won't override any reported exit code */
-            ORTE_UPDATE_EXIT_STATUS(ORTE_ERR_COMM_FAILURE);
+            if (!orte_enable_recovery) {
+                /* output an error message so the user knows what happened */
+                orte_show_help("help-errmgr-base.txt", "node-died", true,
+                               ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                               orte_process_info.nodename,
+                               ORTE_NAME_PRINT(proc),
+                               pptr->node->name);
+                /* update our exit code */
+                ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
+                /* just in case the exit code hadn't been set, do it here - this
+                 * won't override any reported exit code */
+                ORTE_UPDATE_EXIT_STATUS(ORTE_ERR_COMM_FAILURE);
+            }
+        }
+        /* if recovery is enabled, then we are done - otherwise,
+         * abort the system */
+        if (!orte_enable_recovery) {
+            default_hnp_abort(jdata);
         }
-        /* abort the system */
-        default_hnp_abort(jdata);
         goto cleanup;
     }
 
@@ -498,7 +503,8 @@ static void proc_errors(int fd, short args, void *cbdata)
   keep_going:
-    /* if this is a continuously operating job, then there is nothing more
-     * to do - we let the job continue to run */
-    if (orte_get_attribute(&jdata->attributes, ORTE_JOB_CONTINUOUS_OP, NULL, OPAL_BOOL)) {
+    /* if this is a continuously operating job, or the job is flagged as
+     * recoverable, then there is nothing more to do - we let the job continue to run */
+    if (orte_get_attribute(&jdata->attributes, ORTE_JOB_CONTINUOUS_OP, NULL, OPAL_BOOL) ||
+        ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RECOVERABLE)) {
         /* always mark the waitpid as having fired */
         ORTE_ACTIVATE_PROC_STATE(&pptr->name, ORTE_PROC_STATE_WAITPID_FIRED);
         /* if this is a remote proc, we won't hear anything more about it

diff --git a/orte/mca/plm/slurm/plm_slurm_module.c b/orte/mca/plm/slurm/plm_slurm_module.c
@@ -267,8 +267,10 @@ static void launch_daemons(int fd, short args, void *cbdata)
     /* start one orted on each node */
     opal_argv_append(&argc, &argv, "--ntasks-per-node=1");
 
-    /* alert us if any orteds die during startup */
-    opal_argv_append(&argc, &argv, "--kill-on-bad-exit");
+    if (!orte_enable_recovery) {
+        /* kill the job if any orteds die */
+        opal_argv_append(&argc, &argv, "--kill-on-bad-exit");
+    }
 
     /* ensure the orteds are not bound to a single processor,
      * just in case the TaskAffinity option is set by default.

diff --git a/orte/tools/orte-clean/orte-clean.c b/orte/tools/orte-clean/orte-clean.c
@@ -183,7 +183,7 @@ main(int argc, char *argv[])
     free(legacy);
 
     /* and finally get rid of any lingering pmix-related artifacts */
-    asprintf(&legacy, "rm -f %s/pmix*", orte_process_info.tmpdir_base);
+    asprintf(&legacy, "rm -rf %s/pmix*", orte_process_info.tmpdir_base);
     system(legacy);
     free(legacy);