From 0ba9572f9fe0fdba05450150d6491f3f4e26a4dc Mon Sep 17 00:00:00 2001
From: Ralph Castain <rhc@open-mpi.org>
Date: Thu, 2 Jun 2016 17:36:11 -0700
Subject: [PATCH] Cleanup the forced termination a bit by restoring the delay
 before issuing the sigkill, and eliminating the large time loss spent
 checking if the proc died. The latter is responsible for a large number of
 test timeouts in MTT

Update alps component
---
 orte/mca/odls/alps/odls_alps_module.c       | 66 +--------------
 orte/mca/odls/base/odls_base_default_fns.c  | 89 +++++++++++++++++----
 orte/mca/odls/base/odls_private.h           |  3 +-
 orte/mca/odls/default/odls_default_module.c | 67 +---------------
 4 files changed, 76 insertions(+), 149 deletions(-)

diff --git a/orte/mca/odls/alps/odls_alps_module.c b/orte/mca/odls/alps/odls_alps_module.c
index 1f608eb2656..2f6e1f6d819 100644
--- a/orte/mca/odls/alps/odls_alps_module.c
+++ b/orte/mca/odls/alps/odls_alps_module.c
@@ -164,70 +164,6 @@ orte_odls_base_module_t orte_odls_alps_module = {
 };
 
 
-static bool odls_alps_child_died(orte_proc_t *child)
-{
-    time_t end;
-    pid_t ret;
-
-    /* Because of rounding in time (which returns whole seconds) we
-     * have to add 1 to our wait number: this means that we wait
-     * somewhere between (target) and (target)+1 seconds.  Otherwise,
-     * the default 1s actually means 'somwhere between 0 and 1s'. */
-    end = time(NULL) + orte_odls_globals.timeout_before_sigkill + 1;
-    do {
-        OPAL_OUTPUT_VERBOSE((20, orte_odls_base_framework.framework_output,
-                             "%s odls:alps:WAITPID CHECKING PID %d WITH TIMEOUT %d SECONDS",
-                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)(child->pid),
-                             orte_odls_globals.timeout_before_sigkill + 1));
-        ret = waitpid(child->pid, &child->exit_code, WNOHANG);
-        if (child->pid == ret) {
-            OPAL_OUTPUT_VERBOSE((20, orte_odls_base_framework.framework_output,
-                                 "%s odls:alps:WAITPID INDICATES PROC %d IS DEAD",
-                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)(child->pid)));
-            /* It died -- return success */
-            return true;
-        } else if (0 == ret) {
-            /* with NOHANG specified, if a process has already exited
-             * while waitpid was registered, then waitpid returns 0
-             * as there is no error - this is a race condition problem
-             * that occasionally causes us to incorrectly report a proc
-             * as refusing to die. Unfortunately, errno may not be reset
-             * by waitpid in this case, so we cannot check it.
-	     *
-	     * (note the previous fix to this, to return 'process dead'
-	     * here, fixes the race condition at the cost of reporting
-	     * all live processes have immediately died!  Better to
-	     * occasionally report a dead process as still living -
-	     * which will occasionally trip the timeout for cases that
-	     * are right on the edge.)
-             */
-            OPAL_OUTPUT_VERBOSE((20, orte_odls_base_framework.framework_output,
-                                 "%s odls:alps:WAITPID INDICATES PID %d MAY HAVE ALREADY EXITED",
-                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)(child->pid)));
-	    /* Do nothing, process still alive */
-        } else if (-1 == ret && ECHILD == errno) {
-            /* The pid no longer exists, so we'll call this "good
-               enough for government work" */
-            OPAL_OUTPUT_VERBOSE((20, orte_odls_base_framework.framework_output,
-                                 "%s odls:alps:WAITPID INDICATES PID %d NO LONGER EXISTS",
-                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)(child->pid)));
-            return true;
-        }
-
-        /* Bogus delay for 1 msec - let's actually give the CPU some time
-         * to quit the other process (sched_yield() -- even if we have it
-         * -- changed behavior in 2.6.3x Linux flavors to be undesirable)
-         * Don't use select on a bogus file descriptor here as it has proven
-         * unreliable and sometimes immediately returns - we really, really
-         * -do- want to wait a bit!
-         */
-        usleep(1000);
-    } while (time(NULL) < end);
-
-    /* The child didn't die, so return false */
-    return false;
-}
-
 static int odls_alps_kill_local(pid_t pid, int signum)
 {
     pid_t pgrp;
@@ -264,7 +200,7 @@ int orte_odls_alps_kill_local_procs(opal_pointer_array_t *procs)
     int rc;
 
     if (ORTE_SUCCESS != (rc = orte_odls_base_default_kill_local_procs(procs,
-                                    odls_alps_kill_local, odls_alps_child_died))) {
+                                                odls_alps_kill_local))) {
         ORTE_ERROR_LOG(rc);
         return rc;
     }
diff --git a/orte/mca/odls/base/odls_base_default_fns.c b/orte/mca/odls/base/odls_base_default_fns.c
index 4fa97b711f6..44306221746 100644
--- a/orte/mca/odls/base/odls_base_default_fns.c
+++ b/orte/mca/odls/base/odls_base_default_fns.c
@@ -1366,9 +1366,57 @@ void odls_base_default_wait_local_proc(orte_proc_t *proc, void* cbdata)
     ORTE_ACTIVATE_PROC_STATE(&proc->name, state);
 }
 
+typedef struct {
+    opal_object_t super;
+    orte_proc_t *child;
+    orte_odls_base_kill_local_fn_t kill_local;
+} orte_odls_quick_caddy_t;
+static void qcdcon(orte_odls_quick_caddy_t *p)
+{
+    p->child = NULL;
+    p->kill_local = NULL;
+}
+static void qcddes(orte_odls_quick_caddy_t *p)
+{
+    if (NULL != p->child) {
+        OBJ_RELEASE(p->child);
+    }
+}
+OBJ_CLASS_INSTANCE(orte_odls_quick_caddy_t,
+                   opal_object_t,
+                   qcdcon, qcddes);
+
+static void send_kill(int sd, short args, void *cbdata)
+{
+    orte_timer_t *tm = (orte_timer_t*)cbdata;
+    orte_odls_quick_caddy_t *cd = (orte_odls_quick_caddy_t*)tm->payload;
+
+    OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
+                         "%s SENDING FORCE SIGKILL TO %s",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                         ORTE_NAME_PRINT(&cd->child->name)));
+
+    cd->kill_local(cd->child->pid, SIGKILL);
+    /* indicate the waitpid fired as this is effectively what
+     * has happened
+     */
+    ORTE_FLAG_SET(cd->child, ORTE_PROC_FLAG_WAITPID);
+    cd->child->pid = 0;
+
+    /* ensure the child's session directory is cleaned up */
+    orte_session_dir_finalize(&cd->child->name);
+    /* check for everything complete - this will remove
+     * the child object from our local list
+     */
+    if (ORTE_FLAG_TEST(cd->child, ORTE_PROC_FLAG_IOF_COMPLETE) &&
+        ORTE_FLAG_TEST(cd->child, ORTE_PROC_FLAG_WAITPID)) {
+        ORTE_ACTIVATE_PROC_STATE(&cd->child->name, cd->child->state);
+    }
+    OBJ_RELEASE(cd);
+}
+
 int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
-                                            orte_odls_base_kill_local_fn_t kill_local,
-                                            orte_odls_base_child_died_fn_t child_died)
+                                            orte_odls_base_kill_local_fn_t kill_local)
 {
     orte_proc_t *child;
     opal_list_t procs_killed;
@@ -1376,6 +1424,7 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
     int i, j;
     opal_pointer_array_t procarray, *procptr;
     bool do_cleanup;
+    orte_odls_quick_caddy_t *cd;
 
     OBJ_CONSTRUCT(&procs_killed, opal_list_t);
 
@@ -1492,22 +1541,30 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
              */
             orte_wait_cb_cancel(child);
 
-            /* Use SIGKILL just to make sure things are dead
-             * This fixes an issue that, if the application is masking
-             * SIGTERM, then the child_died() may return 'true' even
-             * though waipid returns with 0. It does this to avoid a
-             * race condition, per documentation in odls_default_module.c.
-             */
+            /* First send a SIGCONT in case the process is in stopped state.
+               If it is in a stopped state and we do not first change it to
+               running, then SIGTERM will not get delivered.  Ignore return
+               value. */
             OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
-                                 "%s SENDING FORCE SIGKILL TO %s pid %lu",
+                                 "%s SENDING SIGCONT TO %s",
                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
-                                 ORTE_NAME_PRINT(&child->name), (unsigned long)child->pid));
-            kill_local(child->pid, SIGKILL);
-            /* indicate the waitpid fired as this is effectively what
-             * has happened
-             */
-            ORTE_FLAG_SET(child, ORTE_PROC_FLAG_WAITPID);
-            child->pid = 0;
+                                 ORTE_NAME_PRINT(&child->name)));
+            kill_local(child->pid, SIGCONT);
+
+            /* Send a sigterm to the process before sigkill to be nice */
+            OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
+                                 "%s SENDING SIGTERM TO %s",
+                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                                 ORTE_NAME_PRINT(&child->name)));
+            kill_local(child->pid, SIGTERM);
+
+            cd = OBJ_NEW(orte_odls_quick_caddy_t);
+            OBJ_RETAIN(child);
+            cd->child = child;
+            cd->kill_local = kill_local;
+            ORTE_DETECT_TIMEOUT(1, orte_odls_globals.timeout_before_sigkill,
+                                10000000, send_kill, cd);
+            continue;
 
         CLEANUP:
             /* ensure the child's session directory is cleaned up */
diff --git a/orte/mca/odls/base/odls_private.h b/orte/mca/odls/base/odls_private.h
index 48fa5133c80..2a26df98ccc 100644
--- a/orte/mca/odls/base/odls_private.h
+++ b/orte/mca/odls/base/odls_private.h
@@ -121,8 +121,7 @@ typedef bool (*orte_odls_base_child_died_fn_t)(orte_proc_t *child);
 
 ORTE_DECLSPEC int
 orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
-                                        orte_odls_base_kill_local_fn_t kill_local,
-                                        orte_odls_base_child_died_fn_t child_died);
+                                        orte_odls_base_kill_local_fn_t kill_local);
 
 ORTE_DECLSPEC int orte_odls_base_default_restart_proc(orte_proc_t *child,
                                                       orte_odls_base_fork_local_proc_fn_t fork_local);
diff --git a/orte/mca/odls/default/odls_default_module.c b/orte/mca/odls/default/odls_default_module.c
index 232f3f11c6c..605b7908457 100644
--- a/orte/mca/odls/default/odls_default_module.c
+++ b/orte/mca/odls/default/odls_default_module.c
@@ -164,71 +164,6 @@ orte_odls_base_module_t orte_odls_default_module = {
 };
 
 
-static bool odls_default_child_died(orte_proc_t *child)
-{
-    time_t end;
-    pid_t ret;
-
-    /* Because of rounding in time (which returns whole seconds) we
-     * have to add 1 to our wait number: this means that we wait
-     * somewhere between (target) and (target)+1 seconds.  Otherwise,
-     * the default 1s actually means 'somwhere between 0 and 1s'. */
-    end = time(NULL) + orte_odls_globals.timeout_before_sigkill + 1;
-    do {
-        OPAL_OUTPUT_VERBOSE((20, orte_odls_base_framework.framework_output,
-                             "%s odls:default:WAITPID CHECKING PID %d WITH TIMEOUT %d SECONDS",
-                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)(child->pid),
-                             orte_odls_globals.timeout_before_sigkill + 1));
-        ret = waitpid(child->pid, &child->exit_code, WNOHANG);
-        if (child->pid == ret) {
-            OPAL_OUTPUT_VERBOSE((20, orte_odls_base_framework.framework_output,
-                                 "%s odls:default:WAITPID INDICATES PROC %d IS DEAD",
-                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)(child->pid)));
-            /* It died -- return success */
-            return true;
-        } else if (0 == ret) {
-            /* with NOHANG specified, if a process has already exited
-             * while waitpid was registered, then waitpid returns 0
-             * as there is no error - this is a race condition problem
-             * that occasionally causes us to incorrectly report a proc
-             * as refusing to die. Unfortunately, errno may not be reset
-             * by waitpid in this case, so we cannot check it.
-             *
-             * (note the previous fix to this, to return 'process dead'
-             * here, fixes the race condition at the cost of reporting
-             * all live processes have immediately died!  Better to
-             * occasionally report a dead process as still living -
-             * which will occasionally trip the timeout for cases that
-             * are right on the edge.)
-             */
-            OPAL_OUTPUT_VERBOSE((20, orte_odls_base_framework.framework_output,
-                                 "%s odls:default:WAITPID INDICATES PID %d MAY HAVE ALREADY EXITED",
-                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)(child->pid)));
-            /* Do nothing, process still alive */
-        } else if (-1 == ret && ECHILD == errno) {
-            /* The pid no longer exists, so we'll call this "good
-               enough for government work" */
-            OPAL_OUTPUT_VERBOSE((20, orte_odls_base_framework.framework_output,
-                                 "%s odls:default:WAITPID INDICATES PID %d NO LONGER EXISTS",
-                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)(child->pid)));
-            return true;
-        }
-
-        /* Bogus delay for 1 msec - let's actually give the CPU some time
-         * to quit the other process (sched_yield() -- even if we have it
-         * -- changed behavior in 2.6.3x Linux flavors to be undesirable)
-         * Don't use select on a bogus file descriptor here as it has proven
-         * unreliable and sometimes immediately returns - we really, really
-         * -do- want to wait a bit!
-         */
-        usleep(1000);
-    } while (time(NULL) < end);
-
-    /* The child didn't die, so return false */
-    return false;
-}
-
-
 /* deliver a signal to a specified pid. */
 static int odls_default_kill_local(pid_t pid, int signum)
 {
@@ -251,7 +186,7 @@ int orte_odls_default_kill_local_procs(opal_pointer_array_t *procs)
     int rc;
 
     if (ORTE_SUCCESS != (rc = orte_odls_base_default_kill_local_procs(procs,
-                                    odls_default_kill_local, odls_default_child_died))) {
+                                            odls_default_kill_local))) {
         ORTE_ERROR_LOG(rc);
         return rc;
     }