diff --git a/orte/mca/ess/hnp/Makefile.am b/orte/mca/ess/hnp/Makefile.am index 9c4a9979bfc..88a92ed56fc 100644 --- a/orte/mca/ess/hnp/Makefile.am +++ b/orte/mca/ess/hnp/Makefile.am @@ -10,6 +10,9 @@ # Copyright (c) 2004-2005 The Regents of the University of California. # All rights reserved. # Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2017 Los Alamos National Security, LLC. All rights +# reserved. +# Copyright (c) 2017 Intel, Inc. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -17,6 +20,8 @@ # $HEADER$ # +dist_ortedata_DATA = help-ess-hnp.txt + sources = \ ess_hnp.h \ ess_hnp_component.c \ diff --git a/orte/mca/ess/hnp/ess_hnp.h b/orte/mca/ess/hnp/ess_hnp.h index a4b627be532..0c177210ef5 100644 --- a/orte/mca/ess/hnp/ess_hnp.h +++ b/orte/mca/ess/hnp/ess_hnp.h @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana * University Research and Technology @@ -9,6 +10,9 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2017 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2017 Intel, Inc. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -24,12 +28,19 @@ BEGIN_C_DECLS /* * Module open / close */ -int orte_ess_hnp_component_open(void); -int orte_ess_hnp_component_close(void); -int orte_ess_hnp_component_query(mca_base_module_t **module, int *priority); +typedef struct { + opal_list_item_t super; + char *signame; + int signal; +} ess_hnp_signal_t; +OBJ_CLASS_DECLARATION(ess_hnp_signal_t); +typedef struct { + orte_ess_base_component_t base; + opal_list_t signals; +} orte_ess_hnp_component_t; -ORTE_MODULE_DECLSPEC extern orte_ess_base_component_t mca_ess_hnp_component; +ORTE_MODULE_DECLSPEC extern orte_ess_hnp_component_t mca_ess_hnp_component; END_C_DECLS diff --git a/orte/mca/ess/hnp/ess_hnp_component.c b/orte/mca/ess/hnp/ess_hnp_component.c index 84d95f64637..a52cfb6a5f1 100644 --- a/orte/mca/ess/hnp/ess_hnp_component.c +++ b/orte/mca/ess/hnp/ess_hnp_component.c @@ -10,8 +10,9 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2015 Los Alamos National Security, LLC. All rights + * Copyright (c) 2015-2017 Los Alamos National Security, LLC. All rights * reserved. + * Copyright (c) 2017 Intel, Inc. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -29,45 +30,200 @@ #include "orte/constants.h" #include "orte/util/proc_info.h" +#include "orte/util/show_help.h" #include "orte/mca/ess/ess.h" #include "orte/mca/ess/hnp/ess_hnp.h" +#include "orte/runtime/orte_globals.h" extern orte_ess_base_module_t orte_ess_hnp_module; +static int hnp_component_register (void); +static int hnp_component_open(void); +static int hnp_component_close(void); +static int hnp_component_query(mca_base_module_t **module, int *priority); + +struct known_signal { + /** signal number */ + int signal; + /** signal name */ + char *signame; + /** can this signal be forwarded */ + bool can_forward; +}; + +static struct known_signal known_signals[] = { + {SIGTERM, "SIGTERM", false}, + {SIGHUP, "SIGHUP", false}, + {SIGINT, "SIGINT", false}, + {SIGKILL, "SIGKILL", false}, +#ifdef SIGSYS + {SIGSYS, "SIGSYS", true}, +#endif +#ifdef SIGXCPU + {SIGXCPU, "SIGXCPU", true}, +#endif + {SIGXFSZ, "SIGXFSZ", true}, +#ifdef SIGVTALRM + {SIGVTALRM, "SIGVTALRM", true}, +#endif +#ifdef SIGPROF + {SIGPROF, "SIGPROF", true}, +#endif +#ifdef SIGINFO + {SIGINFO, "SIGINFO", true}, +#endif +#ifdef SIGPWR + {SIGPWR, "SIGPWR", true}, +#endif + {0, NULL}, +}; /* * Instantiate the public struct with all of our public information * and pointers to our public functions in it */ -orte_ess_base_component_t mca_ess_hnp_component = { - .base_version = { - ORTE_ESS_BASE_VERSION_3_0_0, - - /* Component name and version */ - .mca_component_name = "hnp", - MCA_BASE_MAKE_VERSION(component, ORTE_MAJOR_VERSION, ORTE_MINOR_VERSION, - ORTE_RELEASE_VERSION), - - /* Component open and close functions */ - .mca_open_component = orte_ess_hnp_component_open, - .mca_close_component = orte_ess_hnp_component_close, - .mca_query_component = orte_ess_hnp_component_query, - }, - .base_data = { - /* The component is checkpoint ready */ - MCA_BASE_METADATA_PARAM_CHECKPOINT - }, +orte_ess_hnp_component_t mca_ess_hnp_component = { + .base = { + 
.base_version = { + ORTE_ESS_BASE_VERSION_3_0_0, + + /* Component name and version */ + .mca_component_name = "hnp", + MCA_BASE_MAKE_VERSION(component, ORTE_MAJOR_VERSION, ORTE_MINOR_VERSION, + ORTE_RELEASE_VERSION), + + /* Component open and close functions */ + .mca_open_component = hnp_component_open, + .mca_close_component = hnp_component_close, + .mca_query_component = hnp_component_query, + .mca_register_component_params = hnp_component_register, + }, + .base_data = { + /* The component is checkpoint ready */ + MCA_BASE_METADATA_PARAM_CHECKPOINT + } + } }; +static char *additional_signals; -int -orte_ess_hnp_component_open(void) +static int hnp_component_register (void) { + additional_signals = NULL; + (void) mca_base_component_var_register (&mca_ess_hnp_component.base.base_version, + "forward_signals", "Comma-delimited list " + "of additional signals (names or integers) to forward to " + "application processes [\"none\" => forward nothing]", MCA_BASE_VAR_TYPE_STRING, + NULL, 0, 0, OPAL_INFO_LVL_4, MCA_BASE_VAR_SCOPE_READONLY, + &additional_signals); + + return ORTE_SUCCESS; +} + +#define ESS_ADDSIGNAL(x, s) \ + do { \ + ess_hnp_signal_t *_sig; \ + _sig = OBJ_NEW(ess_hnp_signal_t); \ + _sig->signal = (x); \ + _sig->signame = strdup((s)); \ + opal_list_append(&mca_ess_hnp_component.signals, &_sig->super); \ + } while(0) + +static int hnp_component_open(void) +{ + int i, sval; + char **signals, *tmp; + ess_hnp_signal_t *sig; + bool ignore, found; + + OBJ_CONSTRUCT(&mca_ess_hnp_component.signals, opal_list_t); + + /* we know that some signals are (nearly) always defined, regardless + * of environment, so add them here */ + ESS_ADDSIGNAL(SIGTSTP, "SIGTSTP"); + ESS_ADDSIGNAL(SIGUSR1, "SIGUSR1"); + ESS_ADDSIGNAL(SIGUSR2, "SIGUSR2"); + ESS_ADDSIGNAL(SIGABRT, "SIGABRT"); + ESS_ADDSIGNAL(SIGALRM, "SIGALRM"); + ESS_ADDSIGNAL(SIGCONT, "SIGCONT"); +#ifdef SIGURG + ESS_ADDSIGNAL(SIGURG, "SIGURG"); +#endif + + /* see if they asked for anything beyond those - note that they 
may + * have asked for some we already cover, and so we ignore any duplicates */ + if (NULL != additional_signals) { + /* if they told us "none", then dump the list */ + if (0 == strcmp(additional_signals, "none")) { + OPAL_LIST_DESTRUCT(&mca_ess_hnp_component.signals); + /* need to reconstruct it for when we close */ + OBJ_CONSTRUCT(&mca_ess_hnp_component.signals, opal_list_t); + return ORTE_SUCCESS; + } + signals = opal_argv_split(additional_signals, ','); + for (i=0; NULL != signals[i]; i++) { + sval = 0; + if (0 != strncmp(signals[i], "SIG", 3)) { + /* treat it like a number */ + errno = 0; + sval = strtoul(signals[i], &tmp, 10); + if (0 != errno || '\0' != *tmp) { + orte_show_help("help-ess-hnp.txt", "ess-hnp:unknown-signal", + true, signals[i], additional_signals); + opal_argv_free(signals); + return OPAL_ERR_SILENT; + } + } + + /* see if it is one we already covered */ + ignore = false; + OPAL_LIST_FOREACH(sig, &mca_ess_hnp_component.signals, ess_hnp_signal_t) { + if (0 == strcasecmp(signals[i], sig->signame) || sval == sig->signal) { + /* got it - we will ignore */ + ignore = true; + break; + } + } + + if (ignore) { + continue; + } + + /* see if they gave us a signal name */ + found = false; + for (int j = 0 ; known_signals[j].signame ; ++j) { + if (0 == strcasecmp (signals[i], known_signals[j].signame) || sval == known_signals[j].signal) { + if (!known_signals[j].can_forward) { + orte_show_help("help-ess-hnp.txt", "ess-hnp:cannot-forward", + true, known_signals[j].signame, additional_signals); + opal_argv_free(signals); + return OPAL_ERR_SILENT; + } + found = true; + ESS_ADDSIGNAL(known_signals[j].signal, known_signals[j].signame); + break; + } + } + + if (!found) { + if (0 == strncmp(signals[i], "SIG", 3)) { + orte_show_help("help-ess-hnp.txt", "ess-hnp:unknown-signal", + true, signals[i], additional_signals); + opal_argv_free(signals); + return OPAL_ERR_SILENT; + } + + ESS_ADDSIGNAL(sval, signals[i]); + } + } + opal_argv_free (signals); + } + return 
ORTE_SUCCESS; } -int orte_ess_hnp_component_query(mca_base_module_t **module, int *priority) +static int hnp_component_query(mca_base_module_t **module, int *priority) { /* we are the hnp module - we need to be selected @@ -86,9 +242,22 @@ int orte_ess_hnp_component_query(mca_base_module_t **module, int *priority) } -int -orte_ess_hnp_component_close(void) +static int hnp_component_close(void) { return ORTE_SUCCESS; } +/* instantiate the class */ +static void scon(ess_hnp_signal_t *t) +{ + t->signame = NULL; +} +static void sdes(ess_hnp_signal_t *t) +{ + if (NULL != t->signame) { + free(t->signame); + } +} +OBJ_CLASS_INSTANCE(ess_hnp_signal_t, + opal_list_item_t, + scon, sdes); diff --git a/orte/mca/ess/hnp/ess_hnp_module.c b/orte/mca/ess/hnp/ess_hnp_module.c index 0fdbbd935a7..bd5e8edcfa5 100644 --- a/orte/mca/ess/hnp/ess_hnp_module.c +++ b/orte/mca/ess/hnp/ess_hnp_module.c @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana * University Research and Technology @@ -11,9 +12,11 @@ * All rights reserved. * Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved. * Copyright (c) 2011-2014 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights + * Copyright (c) 2011-2017 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2013-2015 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. + * Copyright (c) 2017 Research Organization for Information Science + * and Technology (RIST). All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -110,10 +113,7 @@ static bool forcibly_die=false; static opal_event_t term_handler; static opal_event_t epipe_handler; static int term_pipe[2]; -static opal_event_t sigusr1_handler; -static opal_event_t sigusr2_handler; -static opal_event_t sigtstp_handler; -static opal_event_t sigcont_handler; +static opal_event_t *forward_signals_events = NULL; static void abort_signal_callback(int signal); static void clean_abort(int fd, short flags, void *arg); @@ -142,6 +142,7 @@ static int rte_init(void) uint32_t h; int idx; orte_topology_t *t; + ess_hnp_signal_t *sig; /* run the prolog */ if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) { @@ -184,11 +185,20 @@ static int rte_init(void) signal(SIGINT, abort_signal_callback); signal(SIGHUP, abort_signal_callback); - /** setup callbacks for signals we should foward */ - setup_sighandler(SIGUSR1, &sigusr1_handler, signal_forward_callback); - setup_sighandler(SIGUSR2, &sigusr2_handler, signal_forward_callback); - setup_sighandler(SIGTSTP, &sigtstp_handler, signal_forward_callback); - setup_sighandler(SIGCONT, &sigcont_handler, signal_forward_callback); + /** setup callbacks for signals we should forward */ + if (0 < (idx = opal_list_get_size(&mca_ess_hnp_component.signals))) { + forward_signals_events = (opal_event_t*)malloc(sizeof(opal_event_t) * idx); + if (NULL == forward_signals_events) { + ret = ORTE_ERR_OUT_OF_RESOURCE; + error = "unable to malloc"; + goto error; + } + idx = 0; + OPAL_LIST_FOREACH(sig, &mca_ess_hnp_component.signals, ess_hnp_signal_t) { + setup_sighandler(sig->signal, forward_signals_events + idx, signal_forward_callback); + ++idx; + } + } signals_set = true; /* get the local topology */ @@ -752,6 +762,8 @@ static int rte_finalize(void) { char *contact_path; char *jobfam_dir; + ess_hnp_signal_t *sig; + unsigned int i; if (signals_set) { /* Remove the epipe handler */ @@ -759,12 +771,13 @@ static int rte_finalize(void) /* remove the term handler */ 
opal_event_del(&term_handler); /** Remove the USR signal handlers */ - opal_event_signal_del(&sigusr1_handler); - opal_event_signal_del(&sigusr2_handler); - if (orte_forward_job_control) { - opal_event_signal_del(&sigtstp_handler); - opal_event_signal_del(&sigcont_handler); + i = 0; + OPAL_LIST_FOREACH(sig, &mca_ess_hnp_component.signals, ess_hnp_signal_t) { + opal_event_signal_del(forward_signals_events + i); + ++i; } + free (forward_signals_events); + forward_signals_events = NULL; signals_set = false; } diff --git a/orte/mca/ess/hnp/help-ess-hnp.txt b/orte/mca/ess/hnp/help-ess-hnp.txt new file mode 100644 index 00000000000..7bce2ccbb62 --- /dev/null +++ b/orte/mca/ess/hnp/help-ess-hnp.txt @@ -0,0 +1,27 @@ +# -*- text -*- +# +# Copyright (c) 2017 Intel, Inc. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# +# This is the US/English general help file for the ESS HNP component. +# +[ess-hnp:cannot-forward] +The system does not support trapping and forwarding of the +specified signal: + + signal: %s + param: %s + +Please remove that signal from the ess_hnp_forward_signals MCA parameter. +[ess-hnp:unknown-signal] +The following signal was included in the ess_hnp_forward_signals +MCA parameter: + + signal: %s + param: %s + +This is not a recognized signal value. Please fix or remove it.