From 0ba9572f9fe0fdba05450150d6491f3f4e26a4dc Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Thu, 2 Jun 2016 17:36:11 -0700 Subject: [PATCH] Cleanup the forced termination a bit by restoring the delay before issuing the sigkill, and eliminating the large time loss spent checking if the proc died. The latter is responsible for a large number of test timeouts in MTT Update alps component --- orte/mca/odls/alps/odls_alps_module.c | 66 +-------------- orte/mca/odls/base/odls_base_default_fns.c | 89 +++++++++++++++++---- orte/mca/odls/base/odls_private.h | 3 +- orte/mca/odls/default/odls_default_module.c | 67 +--------------- 4 files changed, 76 insertions(+), 149 deletions(-) diff --git a/orte/mca/odls/alps/odls_alps_module.c b/orte/mca/odls/alps/odls_alps_module.c index 1f608eb2656..2f6e1f6d819 100644 --- a/orte/mca/odls/alps/odls_alps_module.c +++ b/orte/mca/odls/alps/odls_alps_module.c @@ -164,70 +164,6 @@ orte_odls_base_module_t orte_odls_alps_module = { }; -static bool odls_alps_child_died(orte_proc_t *child) -{ - time_t end; - pid_t ret; - - /* Because of rounding in time (which returns whole seconds) we - * have to add 1 to our wait number: this means that we wait - * somewhere between (target) and (target)+1 seconds. Otherwise, - * the default 1s actually means 'somwhere between 0 and 1s'. */ - end = time(NULL) + orte_odls_globals.timeout_before_sigkill + 1; - do { - OPAL_OUTPUT_VERBOSE((20, orte_odls_base_framework.framework_output, - "%s odls:alps:WAITPID CHECKING PID %d WITH TIMEOUT %d SECONDS", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)(child->pid), - orte_odls_globals.timeout_before_sigkill + 1)); - ret = waitpid(child->pid, &child->exit_code, WNOHANG); - if (child->pid == ret) { - OPAL_OUTPUT_VERBOSE((20, orte_odls_base_framework.framework_output, - "%s odls:alps:WAITPID INDICATES PROC %d IS DEAD", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)(child->pid))); - /* It died -- return success */ - return true; - } else if (0 == ret) { - /* with NOHANG specified, if a process has already exited - * while waitpid was registered, then waitpid returns 0 - * as there is no error - this is a race condition problem - * that occasionally causes us to incorrectly report a proc - * as refusing to die. Unfortunately, errno may not be reset - * by waitpid in this case, so we cannot check it. - * - * (note the previous fix to this, to return 'process dead' - * here, fixes the race condition at the cost of reporting - * all live processes have immediately died! Better to - * occasionally report a dead process as still living - - * which will occasionally trip the timeout for cases that - * are right on the edge.) - */ - OPAL_OUTPUT_VERBOSE((20, orte_odls_base_framework.framework_output, - "%s odls:alps:WAITPID INDICATES PID %d MAY HAVE ALREADY EXITED", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)(child->pid))); - /* Do nothing, process still alive */ - } else if (-1 == ret && ECHILD == errno) { - /* The pid no longer exists, so we'll call this "good - enough for government work" */ - OPAL_OUTPUT_VERBOSE((20, orte_odls_base_framework.framework_output, - "%s odls:alps:WAITPID INDICATES PID %d NO LONGER EXISTS", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)(child->pid))); - return true; - } - - /* Bogus delay for 1 msec - let's actually give the CPU some time - * to quit the other process (sched_yield() -- even if we have it - * -- changed behavior in 2.6.3x Linux flavors to be undesirable) - * Don't use select on a bogus file descriptor here as it has proven - * unreliable and sometimes immediately returns - we really, really - * -do- want to wait a bit! - */ - usleep(1000); - } while (time(NULL) < end); - - /* The child didn't die, so return false */ - return false; -} - static int odls_alps_kill_local(pid_t pid, int signum) { pid_t pgrp; @@ -264,7 +200,7 @@ int orte_odls_alps_kill_local_procs(opal_pointer_array_t *procs) int rc; if (ORTE_SUCCESS != (rc = orte_odls_base_default_kill_local_procs(procs, - odls_alps_kill_local, odls_alps_child_died))) { + odls_alps_kill_local))) { ORTE_ERROR_LOG(rc); return rc; } diff --git a/orte/mca/odls/base/odls_base_default_fns.c b/orte/mca/odls/base/odls_base_default_fns.c index 4fa97b711f6..44306221746 100644 --- a/orte/mca/odls/base/odls_base_default_fns.c +++ b/orte/mca/odls/base/odls_base_default_fns.c @@ -1366,9 +1366,57 @@ void odls_base_default_wait_local_proc(orte_proc_t *proc, void* cbdata) ORTE_ACTIVATE_PROC_STATE(&proc->name, state); } +typedef struct { + opal_object_t super; + orte_proc_t *child; + orte_odls_base_kill_local_fn_t kill_local; +} orte_odls_quick_caddy_t; +static void qcdcon(orte_odls_quick_caddy_t *p) +{ + p->child = NULL; + p->kill_local = NULL; +} +static void qcddes(orte_odls_quick_caddy_t *p) +{ + if (NULL != p->child) { + OBJ_RELEASE(p->child); + } +} +OBJ_CLASS_INSTANCE(orte_odls_quick_caddy_t, + opal_object_t, + qcdcon, qcddes); + +static void send_kill(int sd, short args, void *cbdata) +{ + orte_timer_t *tm = (orte_timer_t*)cbdata; + orte_odls_quick_caddy_t *cd = (orte_odls_quick_caddy_t*)tm->payload; + + OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output, + "%s SENDING FORCE SIGKILL TO %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&cd->child->name))); + + cd->kill_local(cd->child->pid, SIGKILL); + /* indicate the waitpid fired as this is effectively what + * has happened + */ + ORTE_FLAG_SET(cd->child, ORTE_PROC_FLAG_WAITPID); + cd->child->pid = 0; + + /* ensure the child's session directory is cleaned up */ + orte_session_dir_finalize(&cd->child->name); + /* check for everything complete - this will remove + * the child object from our local list + */ + if (ORTE_FLAG_TEST(cd->child, ORTE_PROC_FLAG_IOF_COMPLETE) && + ORTE_FLAG_TEST(cd->child, ORTE_PROC_FLAG_WAITPID)) { + ORTE_ACTIVATE_PROC_STATE(&cd->child->name, cd->child->state); + } + OBJ_RELEASE(cd); +} + int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs, - orte_odls_base_kill_local_fn_t kill_local, - orte_odls_base_child_died_fn_t child_died) + orte_odls_base_kill_local_fn_t kill_local) { orte_proc_t *child; opal_list_t procs_killed; @@ -1376,6 +1424,7 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs, int i, j; opal_pointer_array_t procarray, *procptr; bool do_cleanup; + orte_odls_quick_caddy_t *cd; OBJ_CONSTRUCT(&procs_killed, opal_list_t); @@ -1492,22 +1541,30 @@ int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs, */ orte_wait_cb_cancel(child); - /* Use SIGKILL just to make sure things are dead - * This fixes an issue that, if the application is masking - * SIGTERM, then the child_died() may return 'true' even - * though waipid returns with 0. It does this to avoid a - * race condition, per documentation in odls_default_module.c. - */ + /* First send a SIGCONT in case the process is in stopped state. + If it is in a stopped state and we do not first change it to + running, then SIGTERM will not get delivered. Ignore return + value. */ OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output, - "%s SENDING FORCE SIGKILL TO %s pid %lu", + "%s SENDING SIGCONT TO %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&child->name), (unsigned long)child->pid)); - kill_local(child->pid, SIGKILL); - /* indicate the waitpid fired as this is effectively what - * has happened - */ - ORTE_FLAG_SET(child, ORTE_PROC_FLAG_WAITPID); - child->pid = 0; + ORTE_NAME_PRINT(&child->name))); + kill_local(child->pid, SIGCONT); + + /* Send a sigterm to the process before sigkill to be nice */ + OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output, + "%s SENDING SIGTERM TO %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&child->name))); + kill_local(child->pid, SIGTERM); + + cd = OBJ_NEW(orte_odls_quick_caddy_t); + OBJ_RETAIN(child); + cd->child = child; + cd->kill_local = kill_local; + ORTE_DETECT_TIMEOUT(1, orte_odls_globals.timeout_before_sigkill, + 10000000, send_kill, cd); + continue; CLEANUP: /* ensure the child's session directory is cleaned up */ diff --git a/orte/mca/odls/base/odls_private.h b/orte/mca/odls/base/odls_private.h index 48fa5133c80..2a26df98ccc 100644 --- a/orte/mca/odls/base/odls_private.h +++ b/orte/mca/odls/base/odls_private.h @@ -121,8 +121,7 @@ typedef bool (*orte_odls_base_child_died_fn_t)(orte_proc_t *child); ORTE_DECLSPEC int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs, - orte_odls_base_kill_local_fn_t kill_local, - orte_odls_base_child_died_fn_t child_died); + orte_odls_base_kill_local_fn_t kill_local); ORTE_DECLSPEC int orte_odls_base_default_restart_proc(orte_proc_t *child, orte_odls_base_fork_local_proc_fn_t fork_local); diff --git a/orte/mca/odls/default/odls_default_module.c b/orte/mca/odls/default/odls_default_module.c index 232f3f11c6c..605b7908457 100644 --- a/orte/mca/odls/default/odls_default_module.c +++ b/orte/mca/odls/default/odls_default_module.c @@ -164,71 +164,6 @@ orte_odls_base_module_t orte_odls_default_module = { }; -static bool odls_default_child_died(orte_proc_t *child) -{ - time_t end; - pid_t ret; - - /* Because of rounding in time (which returns whole seconds) we - * have to add 1 to our wait number: this means that we wait - * somewhere between (target) and (target)+1 seconds. Otherwise, - * the default 1s actually means 'somwhere between 0 and 1s'. */ - end = time(NULL) + orte_odls_globals.timeout_before_sigkill + 1; - do { - OPAL_OUTPUT_VERBOSE((20, orte_odls_base_framework.framework_output, - "%s odls:default:WAITPID CHECKING PID %d WITH TIMEOUT %d SECONDS", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)(child->pid), - orte_odls_globals.timeout_before_sigkill + 1)); - ret = waitpid(child->pid, &child->exit_code, WNOHANG); - if (child->pid == ret) { - OPAL_OUTPUT_VERBOSE((20, orte_odls_base_framework.framework_output, - "%s odls:default:WAITPID INDICATES PROC %d IS DEAD", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)(child->pid))); - /* It died -- return success */ - return true; - } else if (0 == ret) { - /* with NOHANG specified, if a process has already exited - * while waitpid was registered, then waitpid returns 0 - * as there is no error - this is a race condition problem - * that occasionally causes us to incorrectly report a proc - * as refusing to die. Unfortunately, errno may not be reset - * by waitpid in this case, so we cannot check it. - * - * (note the previous fix to this, to return 'process dead' - * here, fixes the race condition at the cost of reporting - * all live processes have immediately died! Better to - * occasionally report a dead process as still living - - * which will occasionally trip the timeout for cases that - * are right on the edge.) - */ - OPAL_OUTPUT_VERBOSE((20, orte_odls_base_framework.framework_output, - "%s odls:default:WAITPID INDICATES PID %d MAY HAVE ALREADY EXITED", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)(child->pid))); - /* Do nothing, process still alive */ - } else if (-1 == ret && ECHILD == errno) { - /* The pid no longer exists, so we'll call this "good - enough for government work" */ - OPAL_OUTPUT_VERBOSE((20, orte_odls_base_framework.framework_output, - "%s odls:default:WAITPID INDICATES PID %d NO LONGER EXISTS", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)(child->pid))); - return true; - } - - /* Bogus delay for 1 msec - let's actually give the CPU some time - * to quit the other process (sched_yield() -- even if we have it - * -- changed behavior in 2.6.3x Linux flavors to be undesirable) - * Don't use select on a bogus file descriptor here as it has proven - * unreliable and sometimes immediately returns - we really, really - * -do- want to wait a bit! - */ - usleep(1000); - } while (time(NULL) < end); - - /* The child didn't die, so return false */ - return false; -} - - /* deliver a signal to a specified pid. */ static int odls_default_kill_local(pid_t pid, int signum) { @@ -251,7 +186,7 @@ int orte_odls_default_kill_local_procs(opal_pointer_array_t *procs) int rc; if (ORTE_SUCCESS != (rc = orte_odls_base_default_kill_local_procs(procs, - odls_default_kill_local, odls_default_child_died))) { + odls_default_kill_local))) { ORTE_ERROR_LOG(rc); return rc; }