diff --git a/config/orte_config_files.m4 b/config/orte_config_files.m4 index 564ce0ca80b..54e90a06c57 100644 --- a/config/orte_config_files.m4 +++ b/config/orte_config_files.m4 @@ -6,7 +6,7 @@ # Corporation. All rights reserved. # Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights # reserved. -# Copyright (c) 2015-2016 Intel, Inc. All rights reserved +# Copyright (c) 2015 Intel, Inc. All rights reserved # $COPYRIGHT$ # # Additional copyrights may follow @@ -33,6 +33,7 @@ AC_DEFUN([ORTE_CONFIG_FILES],[ orte/tools/orte-migrate/Makefile orte/tools/orte-info/Makefile orte/tools/orte-server/Makefile + orte/tools/orte-submit/Makefile orte/tools/orte-dvm/Makefile ]) ]) diff --git a/ompi/mca/rte/orte/Makefile.am b/ompi/mca/rte/orte/Makefile.am index 804d66adb52..5458e412931 100644 --- a/ompi/mca/rte/orte/Makefile.am +++ b/ompi/mca/rte/orte/Makefile.am @@ -1,8 +1,7 @@ # # Copyright (c) 2012 Los Alamos National Security, LLC. # All rights reserved. -# Copyright (c) 2014 Cisco Systems, Inc. All rights reserved. -# Copyright (c) 2016 Intel, Inc. All rights reserved. +# Copyright (c) 2014 Cisco Systems, Inc. All rights reserved. 
# $COPYRIGHT$ # # Additional copyrights may follow @@ -28,7 +27,7 @@ libmca_rte_orte_la_SOURCES =$(sources) $(headers) libmca_rte_orte_la_LDFLAGS = -module -avoid-version libmca_rte_orte_la_LIBADD = $(top_builddir)/orte/lib@ORTE_LIB_PREFIX@open-rte.la -man_pages = mpirun.1 mpiexec.1 ompi-ps.1 ompi-clean.1 ompi-top.1 ompi-server.1 ompi-dvm.1 +man_pages = mpirun.1 mpiexec.1 ompi-ps.1 ompi-clean.1 ompi-top.1 ompi-server.1 ompi-dvm.1 ompi-submit.1 if WANT_FT man_pages += ompi-checkpoint.1 ompi-restart.1 @@ -45,6 +44,7 @@ install-exec-hook: (cd $(DESTDIR)$(bindir); rm -f ompi-top$(EXEEXT); $(LN_S) orte-top$(EXEEXT) ompi-top$(EXEEXT)) (cd $(DESTDIR)$(bindir); rm -f ompi-server$(EXEEXT); $(LN_S) orte-server$(EXEEXT) ompi-server$(EXEEXT)) (cd $(DESTDIR)$(bindir); rm -f ompi-dvm$(EXEEXT); $(LN_S) orte-dvm$(EXEEXT) ompi-dvm$(EXEEXT)) + (cd $(DESTDIR)$(bindir); rm -f ompi-submit$(EXEEXT); $(LN_S) orte-submit$(EXEEXT) ompi-submit$(EXEEXT)) if WANT_FT (cd $(DESTDIR)$(bindir); rm -f ompi-checkpoint$(EXEEXT); $(LN_S) orte-checkpoint$(EXEEXT) ompi-checkpoint$(EXEEXT)) (cd $(DESTDIR)$(bindir); rm -f ompi-restart$(EXEEXT); $(LN_S) orte-restart$(EXEEXT) ompi-restart$(EXEEXT)) @@ -58,7 +58,8 @@ uninstall-local: $(DESTDIR)$(bindir)/ompi-clean$(EXEEXT) \ $(DESTDIR)$(bindir)/ompi-top$(EXEEXT) \ $(DESTDIR)$(bindir)/ompi-server$(EXEEXT) \ - $(DESTDIR)$(bindir)/ompi-dvm$(EXEEXT) + $(DESTDIR)$(bindir)/ompi-dvm$(EXEEXT) \ + $(DESTDIR)$(bindir)/ompi-submit$(EXEEXT) if WANT_FT rm -f $(DESTDIR)$(bindir)/ompi-checkpoint$(EXEEXT) \ $(DESTDIR)$(bindir)/ompi-restart$(EXEEXT) \ @@ -121,5 +122,8 @@ ompi-server.1: $(top_builddir)/orte/tools/orte-server/orte-server.1 ompi-dvm.1: $(top_builddir)/orte/tools/orte-dvm/orte-dvm.1 cp -f $(top_builddir)/orte/tools/orte-dvm/orte-dvm.1 ompi-dvm.1 +ompi-submit.1: $(top_builddir)/orte/tools/orte-submit/orte-submit.1 + cp -f $(top_builddir)/orte/tools/orte-submit/orte-submit.1 ompi-submit.1 + clean-local: rm -f $(man_pages) diff --git a/opal/mca/base/base.h 
b/opal/mca/base/base.h index 7d31a0277b2..1fdcbd899d7 100644 --- a/opal/mca/base/base.h +++ b/opal/mca/base/base.h @@ -156,7 +156,7 @@ OPAL_DECLSPEC int mca_base_is_component_required(opal_list_t *components_availab /* mca_base_cmd_line.c */ OPAL_DECLSPEC int mca_base_cmd_line_setup(opal_cmd_line_t *cmd); -OPAL_DECLSPEC int mca_base_cmd_line_process_args(char **argv, +OPAL_DECLSPEC int mca_base_cmd_line_process_args(opal_cmd_line_t *cmd, char ***app_env, char ***global_env); OPAL_DECLSPEC void mca_base_cmd_line_wrap_args(char **args); diff --git a/opal/mca/base/mca_base_cmd_line.c b/opal/mca/base/mca_base_cmd_line.c index ded9b22e7c6..d8319167011 100644 --- a/opal/mca/base/mca_base_cmd_line.c +++ b/opal/mca/base/mca_base_cmd_line.c @@ -94,25 +94,29 @@ int mca_base_cmd_line_setup(opal_cmd_line_t *cmd) /* * Look for and handle any -mca options on the command line */ -int mca_base_cmd_line_process_args(char **argv, +int mca_base_cmd_line_process_args(opal_cmd_line_t *cmd, char ***context_env, char ***global_env) { - int i, rc; + int i, num_insts, rc; char **params; char **values; + /* If no relevant parameters were given, just return */ + + if (!opal_cmd_line_is_taken(cmd, OPAL_MCA_CMD_LINE_ID) && + !opal_cmd_line_is_taken(cmd, "g"OPAL_MCA_CMD_LINE_ID)) { + return OPAL_SUCCESS; + } + + /* Handle app context-specific parameters */ + + num_insts = opal_cmd_line_get_ninsts(cmd, OPAL_MCA_CMD_LINE_ID); params = values = NULL; - for (i = 0; NULL != argv[i]; ++i) { - if (0 == strcmp("-"OPAL_MCA_CMD_LINE_ID, argv[i]) || - 0 == strcmp("--"OPAL_MCA_CMD_LINE_ID, argv[i])) { - if (NULL == argv[i+1] || NULL == argv[i+2]) { - return OPAL_ERR_BAD_PARAM; - } - if (OPAL_SUCCESS != (rc = process_arg(argv[i+1], argv[i+2], - ¶ms, &values))) { - return rc; - } - i += 2; + for (i = 0; i < num_insts; ++i) { + if (OPAL_SUCCESS != (rc = process_arg(opal_cmd_line_get_param(cmd, OPAL_MCA_CMD_LINE_ID, i, 0), + opal_cmd_line_get_param(cmd, OPAL_MCA_CMD_LINE_ID, i, 1), + ¶ms, &values))) { + return 
rc; } } if (NULL != params) { @@ -121,19 +125,15 @@ int mca_base_cmd_line_process_args(char **argv, opal_argv_free(values); } + /* Handle global parameters */ + num_insts = opal_cmd_line_get_ninsts(cmd, "g"OPAL_MCA_CMD_LINE_ID); params = values = NULL; - for (i = 0; NULL != argv[i]; ++i) { - if (0 == strcmp("-g"OPAL_MCA_CMD_LINE_ID, argv[i]) || - 0 == strcmp("--g"OPAL_MCA_CMD_LINE_ID, argv[i])) { - if (NULL == argv[i+1] || NULL == argv[i+2]) { - return OPAL_ERR_BAD_PARAM; - } - if (OPAL_SUCCESS != (rc = process_arg(argv[i+1], argv[i+2], - ¶ms, &values))) { - return rc; - } - i += 2; + for (i = 0; i < num_insts; ++i) { + if (OPAL_SUCCESS != (rc = process_arg(opal_cmd_line_get_param(cmd, "g"OPAL_MCA_CMD_LINE_ID, i, 0), + opal_cmd_line_get_param(cmd, "g"OPAL_MCA_CMD_LINE_ID, i, 1), + ¶ms, &values))) { + return rc; } } if (NULL != params) { @@ -190,6 +190,7 @@ static int process_arg(const char *param, const char *value, /* If we didn't already have an value for the same param, save this one away */ + opal_argv_append_nosize(params, param); opal_argv_append_nosize(values, p1); free(p1); diff --git a/opal/runtime/opal_info_support.c b/opal/runtime/opal_info_support.c index 0db87893218..3f4694ce27e 100644 --- a/opal/runtime/opal_info_support.c +++ b/opal/runtime/opal_info_support.c @@ -207,7 +207,7 @@ int opal_info_init(int argc, char **argv, exit(cmd_error ? 
1 : 0); } - mca_base_cmd_line_process_args(argv, &app_env, &global_env); + mca_base_cmd_line_process_args(opal_info_cmd_line, &app_env, &global_env); /* set the flags */ diff --git a/opal/util/cmd_line.c b/opal/util/cmd_line.c index 059bc38fa9b..3aa85640920 100644 --- a/opal/util/cmd_line.c +++ b/opal/util/cmd_line.c @@ -156,9 +156,7 @@ int opal_cmd_line_create(opal_cmd_line_t *cmd, } OBJ_CONSTRUCT(cmd, opal_cmd_line_t); - if (NULL != table) { - ret = opal_cmd_line_add(cmd, table); - } + ret = opal_cmd_line_add(cmd, table); return ret; } diff --git a/orte/mca/ess/hnp/ess_hnp_module.c b/orte/mca/ess/hnp/ess_hnp_module.c index db490b15f8e..7649b155e0e 100644 --- a/orte/mca/ess/hnp/ess_hnp_module.c +++ b/orte/mca/ess/hnp/ess_hnp_module.c @@ -76,7 +76,6 @@ #include "orte/mca/state/base/base.h" #include "orte/mca/state/state.h" -#include "orte/orted/orted_submit.h" #include "orte/orted/pmix/pmix_server.h" #include "orte/util/show_help.h" @@ -714,14 +713,6 @@ static int rte_init(void) goto error; } - /* setup to support debugging */ - orte_state.add_job_state(ORTE_JOB_STATE_READY_FOR_DEBUGGERS, - orte_debugger_init_after_spawn, - ORTE_SYS_PRI); - orte_state.add_job_state(ORTE_JOB_STATE_DEBUGGER_DETACH, - orte_debugger_detached, - ORTE_SYS_PRI); - /* if a tool has launched us and is requesting event reports, * then set its contact info into the comm system */ diff --git a/orte/mca/grpcomm/base/grpcomm_base_stubs.c b/orte/mca/grpcomm/base/grpcomm_base_stubs.c index ef5874067a0..621b645da2b 100644 --- a/orte/mca/grpcomm/base/grpcomm_base_stubs.c +++ b/orte/mca/grpcomm/base/grpcomm_base_stubs.c @@ -324,7 +324,7 @@ static int create_dmns(orte_grpcomm_signature_t *sig, *dmns = NULL; return ORTE_ERR_NOT_FOUND; } - if (0 == jdata->map->num_nodes) { + if (NULL == jdata->map) { /* we haven't generated a job map yet - if we are the HNP, * then we should only involve ourselves. 
Otherwise, we have * no choice but to abort to avoid hangs */ @@ -340,6 +340,12 @@ static int create_dmns(orte_grpcomm_signature_t *sig, *dmns = NULL; return ORTE_ERR_NOT_FOUND; } + /* get the array */ + if (0 == jdata->map->num_nodes) { + ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); + ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); + return ORTE_ERR_SILENT; + } dns = (orte_vpid_t*)malloc(jdata->map->num_nodes * sizeof(vpid)); nds = 0; for (i=0; i < jdata->map->nodes->size && (int)nds < jdata->map->num_nodes; i++) { diff --git a/orte/mca/plm/base/plm_base_launch_support.c b/orte/mca/plm/base/plm_base_launch_support.c index e52347a65e4..e048cd64df9 100644 --- a/orte/mca/plm/base/plm_base_launch_support.c +++ b/orte/mca/plm/base/plm_base_launch_support.c @@ -1537,9 +1537,6 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata) ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); return ORTE_ERR_NOT_FOUND; } - if (NULL == daemons->map) { - daemons->map = OBJ_NEW(orte_job_map_t); - } map = daemons->map; /* if this job is being launched against a fixed DVM, then there is @@ -1555,7 +1552,8 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata) * the virtual machine unless specifically requested to do so */ if (ORTE_JOBID_INVALID != jdata->originator.jobid) { - if (0 == map->num_nodes) { + OBJ_CONSTRUCT(&nodes, opal_list_t); + if (NULL == daemons->map) { OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output, "%s plm:base:setup_vm creating map", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); @@ -1564,15 +1562,16 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata) * are obviously already here! The ess will already * have assigned our node to us. 
*/ + daemons->map = OBJ_NEW(orte_job_map_t); node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0); - opal_pointer_array_add(map->nodes, (void*)node); - ++(map->num_nodes); + opal_pointer_array_add(daemons->map->nodes, (void*)node); + ++(daemons->map->num_nodes); /* maintain accounting */ OBJ_RETAIN(node); /* mark that this is from a singleton */ singleton = true; } - OBJ_CONSTRUCT(&nodes, opal_list_t); + map = daemons->map; for (i=1; i < orte_node_pool->size; i++) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) { continue; @@ -1619,6 +1618,16 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata) */ if (orte_get_attribute(&daemons->attributes, ORTE_JOB_NO_VM, NULL, OPAL_BOOL)) { OBJ_CONSTRUCT(&nodes, opal_list_t); + if (NULL == daemons->map) { + OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output, + "%s plm:base:setup_vm creating map", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + /* this is the first time thru, so the vm is just getting + * defined - create a map for it + */ + daemons->map = OBJ_NEW(orte_job_map_t); + } + map = daemons->map; /* loop across all nodes and include those that have * num_procs > 0 && no daemon already on them */ @@ -1676,21 +1685,23 @@ int orte_plm_base_setup_virtual_machine(orte_job_t *jdata) goto process; } - if (0 == map->num_nodes) { + if (NULL == daemons->map) { OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output, "%s plm:base:setup_vm creating map", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* this is the first time thru, so the vm is just getting - * defined - put us in as we + * defined - create a map for it and put us in as we * are obviously already here! The ess will already * have assigned our node to us. 
*/ + daemons->map = OBJ_NEW(orte_job_map_t); node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0); - opal_pointer_array_add(map->nodes, (void*)node); - ++(map->num_nodes); + opal_pointer_array_add(daemons->map->nodes, (void*)node); + ++(daemons->map->num_nodes); /* maintain accounting */ OBJ_RETAIN(node); } + map = daemons->map; /* zero-out the number of new daemons as we will compute this * each time we are called diff --git a/orte/mca/rmaps/ppr/rmaps_ppr.c b/orte/mca/rmaps/ppr/rmaps_ppr.c index 7bb6af1f6f0..226ee68e863 100644 --- a/orte/mca/rmaps/ppr/rmaps_ppr.c +++ b/orte/mca/rmaps/ppr/rmaps_ppr.c @@ -110,10 +110,8 @@ static int ppr_mapper(orte_job_t *jdata) ORTE_MAPPING_PPR != ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { /* not for us */ opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:ppr: job %s not using ppr mapper PPR %s policy %s", - ORTE_JOBID_PRINT(jdata->jobid), - (NULL == jdata->map->ppr) ? "NULL" : jdata->map->ppr, - (ORTE_MAPPING_PPR == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) ? 
"PPRSET" : "PPR NOTSET"); + "mca:rmaps:ppr: job %s not using ppr mapper", + ORTE_JOBID_PRINT(jdata->jobid)); return ORTE_ERR_TAKE_NEXT_OPTION; } diff --git a/orte/mca/schizo/base/base.h b/orte/mca/schizo/base/base.h index fd33fb10c9e..1cb72d702c7 100644 --- a/orte/mca/schizo/base/base.h +++ b/orte/mca/schizo/base/base.h @@ -41,7 +41,6 @@ ORTE_DECLSPEC int orte_schizo_base_select(void); typedef struct { /* list of active modules */ opal_list_t active_modules; - char **personalities; } orte_schizo_base_t; /** @@ -62,13 +61,15 @@ OBJ_CLASS_DECLARATION(orte_schizo_base_active_module_t); /* the base stub functions */ ORTE_DECLSPEC const char* orte_schizo_base_print_env(orte_schizo_launch_environ_t env); -ORTE_DECLSPEC int orte_schizo_base_define_cli(opal_cmd_line_t *cli); -ORTE_DECLSPEC int orte_schizo_base_parse_cli(int argc, int start, char **argv); -ORTE_DECLSPEC int orte_schizo_base_parse_env(char *path, +ORTE_DECLSPEC int orte_schizo_base_parse_cli(char **personality, + int argc, int start, char **argv); +ORTE_DECLSPEC int orte_schizo_base_parse_env(char **personality, + char *path, opal_cmd_line_t *cmd_line, char **srcenv, char ***dstenv); -ORTE_DECLSPEC int orte_schizo_base_setup_app(orte_app_context_t *app); +ORTE_DECLSPEC int orte_schizo_base_setup_app(char **personality, + orte_app_context_t *app); ORTE_DECLSPEC int orte_schizo_base_setup_fork(orte_job_t *jdata, orte_app_context_t *context); ORTE_DECLSPEC int orte_schizo_base_setup_child(orte_job_t *jobdat, diff --git a/orte/mca/schizo/base/schizo_base_frame.c b/orte/mca/schizo/base/schizo_base_frame.c index 0665aff9d46..c9fb70650f5 100644 --- a/orte/mca/schizo/base/schizo_base_frame.c +++ b/orte/mca/schizo/base/schizo_base_frame.c @@ -37,7 +37,6 @@ */ orte_schizo_base_t orte_schizo_base = {{{0}}}; orte_schizo_base_module_t orte_schizo = { - .define_cli = orte_schizo_base_define_cli, .parse_cli = orte_schizo_base_parse_cli, .parse_env = orte_schizo_base_parse_env, .setup_app = orte_schizo_base_setup_app, @@ 
-47,28 +46,10 @@ orte_schizo_base_module_t orte_schizo = { .finalize = orte_schizo_base_finalize }; -static char *personalities = NULL; - -static int orte_schizo_base_register(mca_base_register_flag_t flags) -{ - /* pickup any defined personalities */ - personalities = NULL; - mca_base_var_register("orte", "schizo", "base", "personalities", - "Comma-separated list of personalities", - MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &personalities); - return ORTE_SUCCESS; -} - static int orte_schizo_base_close(void) { /* cleanup globals */ OPAL_LIST_DESTRUCT(&orte_schizo_base.active_modules); - if (NULL != orte_schizo_base.personalities) { - opal_argv_free(orte_schizo_base.personalities); - } return mca_base_framework_components_close(&orte_schizo_base_framework, NULL); } @@ -83,10 +64,6 @@ static int orte_schizo_base_open(mca_base_open_flag_t flags) /* init the globals */ OBJ_CONSTRUCT(&orte_schizo_base.active_modules, opal_list_t); - orte_schizo_base.personalities = NULL; - if (NULL != personalities) { - orte_schizo_base.personalities = opal_argv_split(personalities, ','); - } /* Open up all available components */ rc = mca_base_framework_components_open(&orte_schizo_base_framework, flags); @@ -96,8 +73,7 @@ static int orte_schizo_base_open(mca_base_open_flag_t flags) } MCA_BASE_FRAMEWORK_DECLARE(orte, schizo, "ORTE Schizo Subsystem", - orte_schizo_base_register, - orte_schizo_base_open, orte_schizo_base_close, + NULL, orte_schizo_base_open, orte_schizo_base_close, mca_schizo_base_static_components, 0); OBJ_CLASS_INSTANCE(orte_schizo_base_active_module_t, diff --git a/orte/mca/schizo/base/schizo_base_select.c b/orte/mca/schizo/base/schizo_base_select.c index d37a661abaa..00fc0b0da95 100644 --- a/orte/mca/schizo/base/schizo_base_select.c +++ b/orte/mca/schizo/base/schizo_base_select.c @@ -28,6 +28,8 @@ * available. 
*/ +static bool selected = false; + int orte_schizo_base_select(void) { mca_base_component_list_item_t *cli = NULL; @@ -38,10 +40,11 @@ int orte_schizo_base_select(void) int rc, priority; bool inserted; - if (0 < opal_list_get_size(&orte_schizo_base.active_modules)) { + if (selected) { /* ensure we don't do this twice */ return ORTE_SUCCESS; } + selected = true; /* Query all available components and ask if they have a module */ OPAL_LIST_FOREACH(cli, &orte_schizo_base_framework.framework_components, mca_base_component_list_item_t) { diff --git a/orte/mca/schizo/base/schizo_base_stubs.c b/orte/mca/schizo/base/schizo_base_stubs.c index cd50b7a820e..21794c3d59d 100644 --- a/orte/mca/schizo/base/schizo_base_stubs.c +++ b/orte/mca/schizo/base/schizo_base_stubs.c @@ -37,31 +37,19 @@ } } -int orte_schizo_base_define_cli(opal_cmd_line_t *cli) +int orte_schizo_base_parse_cli(char **personality, + int argc, int start, char **argv) { int rc; orte_schizo_base_active_module_t *mod; - OPAL_LIST_FOREACH(mod, &orte_schizo_base.active_modules, orte_schizo_base_active_module_t) { - if (NULL != mod->module->define_cli) { - rc = mod->module->define_cli(cli); - if (ORTE_SUCCESS != rc && ORTE_ERR_TAKE_NEXT_OPTION != rc) { - ORTE_ERROR_LOG(rc); - return rc; - } - } + if (NULL == personality) { + return ORTE_ERR_NOT_SUPPORTED; } - return ORTE_SUCCESS; -} - -int orte_schizo_base_parse_cli(int argc, int start, char **argv) -{ - int rc; - orte_schizo_base_active_module_t *mod; OPAL_LIST_FOREACH(mod, &orte_schizo_base.active_modules, orte_schizo_base_active_module_t) { if (NULL != mod->module->parse_cli) { - rc = mod->module->parse_cli(argc, start, argv); + rc = mod->module->parse_cli(personality, argc, start, argv); if (ORTE_SUCCESS != rc && ORTE_ERR_TAKE_NEXT_OPTION != rc) { ORTE_ERROR_LOG(rc); return rc; @@ -71,7 +59,8 @@ int orte_schizo_base_parse_cli(int argc, int start, char **argv) return ORTE_SUCCESS; } -int orte_schizo_base_parse_env(char *path, +int orte_schizo_base_parse_env(char 
**personality, + char *path, opal_cmd_line_t *cmd_line, char **srcenv, char ***dstenv) @@ -81,7 +70,7 @@ int orte_schizo_base_parse_env(char *path, OPAL_LIST_FOREACH(mod, &orte_schizo_base.active_modules, orte_schizo_base_active_module_t) { if (NULL != mod->module->parse_env) { - rc = mod->module->parse_env(path, cmd_line, srcenv, dstenv); + rc = mod->module->parse_env(personality, path, cmd_line, srcenv, dstenv); if (ORTE_SUCCESS != rc && ORTE_ERR_TAKE_NEXT_OPTION != rc) { ORTE_ERROR_LOG(rc); return rc; @@ -91,14 +80,15 @@ int orte_schizo_base_parse_env(char *path, return ORTE_SUCCESS; } -int orte_schizo_base_setup_app(orte_app_context_t *app) +int orte_schizo_base_setup_app(char **personality, + orte_app_context_t *app) { int rc; orte_schizo_base_active_module_t *mod; OPAL_LIST_FOREACH(mod, &orte_schizo_base.active_modules, orte_schizo_base_active_module_t) { if (NULL != mod->module->setup_app) { - rc = mod->module->setup_app(app); + rc = mod->module->setup_app(personality, app); if (ORTE_SUCCESS != rc && ORTE_ERR_TAKE_NEXT_OPTION != rc) { ORTE_ERROR_LOG(rc); return rc; diff --git a/orte/mca/schizo/ompi/schizo_ompi.c b/orte/mca/schizo/ompi/schizo_ompi.c index 786d166bd64..9eba49c2bd8 100644 --- a/orte/mca/schizo/ompi/schizo_ompi.c +++ b/orte/mca/schizo/ompi/schizo_ompi.c @@ -13,7 +13,7 @@ * All rights reserved. * Copyright (c) 2009-2015 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2013-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2015 Intel, Inc. All rights reserved. * Copyright (c) 2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. 
* $COPYRIGHT$ @@ -42,17 +42,17 @@ #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/ess/base/base.h" #include "orte/mca/rmaps/rmaps_types.h" -#include "orte/orted/orted_submit.h" #include "orte/util/name_fns.h" #include "orte/util/session_dir.h" #include "orte/util/show_help.h" #include "orte/runtime/orte_globals.h" -#include "orte/mca/schizo/base/base.h" +#include "orte/mca/schizo/schizo.h" -static int define_cli(opal_cmd_line_t *cli); -static int parse_cli(int argc, int start, char **argv); -static int parse_env(char *path, +static int parse_cli(char **personality, + int argc, int start, char **argv); +static int parse_env(char **personality, + char *path, opal_cmd_line_t *cmd_line, char **srcenv, char ***dstenv); @@ -63,426 +63,14 @@ static int setup_child(orte_job_t *jobdat, orte_app_context_t *app); orte_schizo_base_module_t orte_schizo_ompi_module = { - .define_cli = define_cli, .parse_cli = parse_cli, .parse_env = parse_env, .setup_fork = setup_fork, .setup_child = setup_child }; - -static opal_cmd_line_init_t cmd_line_init[] = { - /* Various "obvious" options */ - { NULL, 'h', NULL, "help", 0, - &orte_cmd_line.help, OPAL_CMD_LINE_TYPE_BOOL, - "This help message" }, - { NULL, 'V', NULL, "version", 0, - &orte_cmd_line.version, OPAL_CMD_LINE_TYPE_BOOL, - "Print version and exit" }, - { NULL, 'v', NULL, "verbose", 0, - &orte_cmd_line.verbose, OPAL_CMD_LINE_TYPE_BOOL, - "Be verbose" }, - { "orte_execute_quiet", 'q', NULL, "quiet", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Suppress helpful messages" }, - { NULL, '\0', "report-pid", "report-pid", 1, - &orte_cmd_line.report_pid, OPAL_CMD_LINE_TYPE_STRING, - "Printout pid on stdout [-], stderr [+], or a file [anything else]" }, - { NULL, '\0', "report-uri", "report-uri", 1, - &orte_cmd_line.report_uri, OPAL_CMD_LINE_TYPE_STRING, - "Printout URI on stdout [-], stderr [+], or a file [anything else]" }, - - /* exit status reporting */ - { "orte_report_child_jobs_separately", '\0', "report-child-jobs-separately", 
"report-child-jobs-separately", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Return the exit status of the primary job only" }, - - /* uri of the dvm, or at least where to get it */ - { NULL, '\0', "hnp", "hnp", 1, - &orte_cmd_line.hnp, OPAL_CMD_LINE_TYPE_STRING, - "Specify the URI of the HNP, or the name of the file (specified as file:filename) that contains that info" }, - - /* hetero apps */ - { "orte_hetero_apps", '\0', NULL, "hetero-apps", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Indicates that multiple app_contexts are being provided that are a mix of 32/64 bit binaries" }, - - /* select XML output */ - { "orte_xml_output", '\0', "xml", "xml", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Provide all output in XML format" }, - { "orte_xml_file", '\0', "xml-file", "xml-file", 1, - NULL, OPAL_CMD_LINE_TYPE_STRING, - "Provide all output in XML format to the specified file" }, - - /* tag output */ - { "orte_tag_output", '\0', "tag-output", "tag-output", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Tag all output with [job,rank]" }, - { "orte_timestamp_output", '\0', "timestamp-output", "timestamp-output", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Timestamp all application process output" }, - { "orte_output_filename", '\0', "output-filename", "output-filename", 1, - NULL, OPAL_CMD_LINE_TYPE_STRING, - "Redirect output from application processes into filename/job/rank/std[out,err,diag]" }, - { NULL, '\0', "merge-stderr-to-stdout", "merge-stderr-to-stdout", 0, - &orte_cmd_line.merge, OPAL_CMD_LINE_TYPE_BOOL, - "Merge stderr to stdout for each process"}, - { "orte_xterm", '\0', "xterm", "xterm", 1, - NULL, OPAL_CMD_LINE_TYPE_STRING, - "Create a new xterm window and display output from the specified ranks there" }, - - /* select stdin option */ - { NULL, '\0', "stdin", "stdin", 1, - &orte_cmd_line.stdin_target, OPAL_CMD_LINE_TYPE_STRING, - "Specify procs to receive stdin [rank, all, none] (default: 0, indicating rank 0)" }, - - /* request that argv[0] be indexed */ - { NULL, '\0', 
"index-argv-by-rank", "index-argv-by-rank", 0, - &orte_cmd_line.index_argv, OPAL_CMD_LINE_TYPE_BOOL, - "Uniquely index argv[0] for each process using its rank" }, - - /* Specify the launch agent to be used */ - { "orte_launch_agent", '\0', "launch-agent", "launch-agent", 1, - NULL, OPAL_CMD_LINE_TYPE_STRING, - "Command used to start processes on remote nodes (default: orted)" }, - - /* Preload the binary on the remote machine */ - { NULL, 's', NULL, "preload-binary", 0, - &orte_cmd_line.preload_binaries, OPAL_CMD_LINE_TYPE_BOOL, - "Preload the binary on the remote machine before starting the remote process." }, - - /* Preload files on the remote machine */ - { NULL, '\0', NULL, "preload-files", 1, - &orte_cmd_line.preload_files, OPAL_CMD_LINE_TYPE_STRING, - "Preload the comma separated list of files to the remote machines current working directory before starting the remote process." }, - -#if OPAL_ENABLE_FT_CR == 1 - /* Tell SStore to preload a snapshot before launch */ - { NULL, '\0', NULL, "sstore-load", 1, - &orte_cmd_line.sstore_load, OPAL_CMD_LINE_TYPE_STRING, - "Internal Use Only! Tell SStore to preload a snapshot before launch." 
}, -#endif - - /* Use an appfile */ - { NULL, '\0', NULL, "app", 1, - &orte_cmd_line.appfile, OPAL_CMD_LINE_TYPE_STRING, - "Provide an appfile; ignore all other command line options" }, - - /* Number of processes; -c, -n, --n, -np, and --np are all - synonyms */ - { NULL, 'c', "np", "np", 1, - &orte_cmd_line.num_procs, OPAL_CMD_LINE_TYPE_INT, - "Number of processes to run" }, - { NULL, '\0', "n", "n", 1, - &orte_cmd_line.num_procs, OPAL_CMD_LINE_TYPE_INT, - "Number of processes to run" }, - - /* maximum size of VM - typically used to subdivide an allocation */ - { "orte_max_vm_size", '\0', "max-vm-size", "max-vm-size", 1, - NULL, OPAL_CMD_LINE_TYPE_INT, - "Number of processes to run" }, - - /* Set a hostfile */ - { NULL, '\0', "hostfile", "hostfile", 1, - NULL, OPAL_CMD_LINE_TYPE_STRING, - "Provide a hostfile" }, - { NULL, '\0', "machinefile", "machinefile", 1, - NULL, OPAL_CMD_LINE_TYPE_STRING, - "Provide a hostfile" }, - { "orte_default_hostfile", '\0', "default-hostfile", "default-hostfile", 1, - NULL, OPAL_CMD_LINE_TYPE_STRING, - "Provide a default hostfile" }, - { "opal_if_do_not_resolve", '\0', "do-not-resolve", "do-not-resolve", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Do not attempt to resolve interfaces" }, - - /* uri of PMIx publish/lookup server, or at least where to get it */ - { "pmix_server_uri", '\0', "ompi-server", "ompi-server", 1, - NULL, OPAL_CMD_LINE_TYPE_STRING, - "Specify the URI of the publish/lookup server, or the name of the file (specified as file:filename) that contains that info" }, - - { "carto_file_path", '\0', "cf", "cartofile", 1, - NULL, OPAL_CMD_LINE_TYPE_STRING, - "Provide a cartography file" }, - - { "orte_rankfile", '\0', "rf", "rankfile", 1, - NULL, OPAL_CMD_LINE_TYPE_STRING, - "Provide a rankfile file" }, - - /* Export environment variables; potentially used multiple times, - so it does not make sense to set into a variable */ - { NULL, 'x', NULL, NULL, 1, - NULL, OPAL_CMD_LINE_TYPE_NULL, - "Export an environment variable, 
optionally specifying a value (e.g., \"-x foo\" exports the environment variable foo and takes its value from the current environment; \"-x foo=bar\" exports the environment variable name foo and sets its value to \"bar\" in the started processes)" }, - - /* Mapping controls */ - { "rmaps_base_display_map", '\0', "display-map", "display-map", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Display the process map just before launch"}, - { "rmaps_base_display_devel_map", '\0', "display-devel-map", "display-devel-map", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Display a detailed process map (mostly intended for developers) just before launch"}, - { "rmaps_base_display_topo_with_map", '\0', "display-topo", "display-topo", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Display the topology as part of the process map (mostly intended for developers) just before launch"}, - { "rmaps_base_display_diffable_map", '\0', "display-diffable-map", "display-diffable-map", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Display a diffable process map (mostly intended for developers) just before launch"}, - { NULL, 'H', "host", "host", 1, - NULL, OPAL_CMD_LINE_TYPE_STRING, - "List of hosts to invoke processes on" }, - { "rmaps_base_no_schedule_local", '\0', "nolocal", "nolocal", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Do not run any MPI applications on the local node" }, - { "rmaps_base_no_oversubscribe", '\0', "nooversubscribe", "nooversubscribe", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Nodes are not to be oversubscribed, even if the system supports such operation"}, - { "rmaps_base_oversubscribe", '\0', "oversubscribe", "oversubscribe", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Nodes are allowed to be oversubscribed, even on a managed system, and overloading of processing elements"}, - { "rmaps_base_cpus_per_rank", '\0', "cpus-per-proc", "cpus-per-proc", 1, - NULL, OPAL_CMD_LINE_TYPE_INT, - "Number of cpus to use for each process [default=1]" }, - { "rmaps_base_cpus_per_rank", '\0', "cpus-per-rank", 
"cpus-per-rank", 1, - NULL, OPAL_CMD_LINE_TYPE_INT, - "Synonym for cpus-per-proc" }, - - /* backward compatiblity */ - { "rmaps_base_bycore", '\0', "bycore", "bycore", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Whether to map and rank processes round-robin by core" }, - { "rmaps_base_bynode", '\0', "bynode", "bynode", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Whether to map and rank processes round-robin by node" }, - { "rmaps_base_byslot", '\0', "byslot", "byslot", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Whether to map and rank processes round-robin by slot" }, - - /* Nperxxx options that do not require topology and are always - * available - included for backwards compatibility - */ - { "rmaps_ppr_pernode", '\0', "pernode", "pernode", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Launch one process per available node" }, - { "rmaps_ppr_n_pernode", '\0', "npernode", "npernode", 1, - NULL, OPAL_CMD_LINE_TYPE_INT, - "Launch n processes per node on all allocated nodes" }, - { "rmaps_ppr_n_pernode", '\0', "N", NULL, 1, - NULL, OPAL_CMD_LINE_TYPE_INT, - "Launch n processes per node on all allocated nodes (synonym for npernode)" }, - - /* declare hardware threads as independent cpus */ - { "hwloc_base_use_hwthreads_as_cpus", '\0', "use-hwthread-cpus", "use-hwthread-cpus", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Use hardware threads as independent cpus" }, - - /* include npersocket for backwards compatibility */ - { "rmaps_ppr_n_persocket", '\0', "npersocket", "npersocket", 1, - NULL, OPAL_CMD_LINE_TYPE_INT, - "Launch n processes per socket on all allocated nodes" }, - - /* Mapping options */ - { "rmaps_base_mapping_policy", '\0', NULL, "map-by", 1, - NULL, OPAL_CMD_LINE_TYPE_STRING, - "Mapping Policy [slot | hwthread | core | socket (default) | numa | board | node]" }, - - /* Ranking options */ - { "rmaps_base_ranking_policy", '\0', NULL, "rank-by", 1, - NULL, OPAL_CMD_LINE_TYPE_STRING, - "Ranking Policy [slot (default) | hwthread | core | socket | numa | board | node]" }, - - /* 
Binding options */ - { "hwloc_base_binding_policy", '\0', NULL, "bind-to", 1, - NULL, OPAL_CMD_LINE_TYPE_STRING, - "Policy for binding processes. Allowed values: none, hwthread, core, l1cache, l2cache, l3cache, socket, numa, board (\"none\" is the default when oversubscribed, \"core\" is the default when np<=2, and \"socket\" is the default when np>2). Allowed qualifiers: overload-allowed, if-supported" }, - - /* backward compatiblity */ - { "hwloc_base_bind_to_core", '\0', "bind-to-core", "bind-to-core", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Bind processes to cores" }, - { "hwloc_base_bind_to_socket", '\0', "bind-to-socket", "bind-to-socket", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Bind processes to sockets" }, - - { "hwloc_base_report_bindings", '\0', "report-bindings", "report-bindings", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Whether to report process bindings to stderr" }, - - /* slot list option */ - { "hwloc_base_slot_list", '\0', "slot-list", "slot-list", 1, - NULL, OPAL_CMD_LINE_TYPE_STRING, - "List of processor IDs to bind processes to [default=NULL]"}, - - /* generalized pattern mapping option */ - { "rmaps_ppr_pattern", '\0', NULL, "ppr", 1, - NULL, OPAL_CMD_LINE_TYPE_STRING, - "Comma-separated list of number of processes on a given resource type [default: none]" }, - - /* Allocation options */ - { "orte_display_alloc", '\0', "display-allocation", "display-allocation", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Display the allocation being used by this job"}, - { "orte_display_devel_alloc", '\0', "display-devel-allocation", "display-devel-allocation", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Display a detailed list (mostly intended for developers) of the allocation being used by this job"}, - { "hwloc_base_cpu_set", '\0', "cpu-set", "cpu-set", 1, - NULL, OPAL_CMD_LINE_TYPE_STRING, - "Comma-separated list of ranges specifying logical cpus allocated to this job [default: none]"}, - - /* mpiexec-like arguments */ - { NULL, '\0', "wdir", "wdir", 1, - 
&orte_cmd_line.wdir, OPAL_CMD_LINE_TYPE_STRING, - "Set the working directory of the started processes" }, - { NULL, '\0', "wd", "wd", 1, - &orte_cmd_line.wdir, OPAL_CMD_LINE_TYPE_STRING, - "Synonym for --wdir" }, - { NULL, '\0', "set-cwd-to-session-dir", "set-cwd-to-session-dir", 0, - &orte_cmd_line.set_cwd_to_session_dir, OPAL_CMD_LINE_TYPE_BOOL, - "Set the working directory of the started processes to their session directory" }, - { NULL, '\0', "path", "path", 1, - &orte_cmd_line.path, OPAL_CMD_LINE_TYPE_STRING, - "PATH to be used to look for executables to start processes" }, - - /* User-level debugger arguments */ - { NULL, '\0', "tv", "tv", 0, - &orte_cmd_line.debugger, OPAL_CMD_LINE_TYPE_BOOL, - "Deprecated backwards compatibility flag; synonym for \"--debug\"" }, - { NULL, '\0', "debug", "debug", 0, - &orte_cmd_line.debugger, OPAL_CMD_LINE_TYPE_BOOL, - "Invoke the user-level debugger indicated by the orte_base_user_debugger MCA parameter" }, - { "orte_base_user_debugger", '\0', "debugger", "debugger", 1, - NULL, OPAL_CMD_LINE_TYPE_STRING, - "Sequence of debuggers to search for when \"--debug\" is used" }, - { "orte_output_debugger_proctable", '\0', "output-proctable", "output-proctable", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Output the debugger proctable after launch" }, - - /* OpenRTE arguments */ - { "orte_debug", 'd', "debug-devel", "debug-devel", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Enable debugging of OpenRTE" }, - - { "orte_debug_daemons", '\0', "debug-daemons", "debug-daemons", 0, - NULL, OPAL_CMD_LINE_TYPE_INT, - "Enable debugging of any OpenRTE daemons used by this application" }, - - { "orte_debug_daemons_file", '\0', "debug-daemons-file", "debug-daemons-file", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Enable debugging of any OpenRTE daemons used by this application, storing output in files" }, - - { "orte_leave_session_attached", '\0', "leave-session-attached", "leave-session-attached", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Enable debugging of 
OpenRTE" }, - - { "orte_do_not_launch", '\0', "do-not-launch", "do-not-launch", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Perform all necessary operations to prepare to launch the application, but do not actually launch it" }, - - { NULL, '\0', NULL, "prefix", 1, - NULL, OPAL_CMD_LINE_TYPE_STRING, - "Prefix where Open MPI is installed on remote nodes" }, - { NULL, '\0', NULL, "noprefix", 0, - NULL, OPAL_CMD_LINE_TYPE_STRING, - "Disable automatic --prefix behavior" }, - - { "orte_report_launch_progress", '\0', "show-progress", "show-progress", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Output a brief periodic report on launch progress" }, - - { "orte_use_regexp", '\0', "use-regexp", "use-regexp", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Use regular expressions for launch" }, - - { "orte_report_events", '\0', "report-events", "report-events", 1, - NULL, OPAL_CMD_LINE_TYPE_STRING, - "Report events to a tool listening at the specified URI" }, - - { "orte_enable_recovery", '\0', "enable-recovery", "enable-recovery", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Enable recovery from process failure [Default = disabled]" }, - - { "orte_max_restarts", '\0', "max-restarts", "max-restarts", 1, - NULL, OPAL_CMD_LINE_TYPE_INT, - "Max number of times to restart a failed process" }, - - { "orte_hetero_nodes", '\0', NULL, "hetero-nodes", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Nodes in cluster may differ in topology, so send the topology back from each node [Default = false]" }, - -#if OPAL_ENABLE_CRDEBUG == 1 - { "opal_cr_enable_crdebug", '\0', "crdebug", "crdebug", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Enable C/R Debugging" }, -#endif - - { NULL, '\0', "disable-recovery", "disable-recovery", 0, - &orte_cmd_line.disable_recovery, OPAL_CMD_LINE_TYPE_BOOL, - "Disable recovery (resets all recovery options to off)" }, - - { "state_novm_select", '\0', "novm", "novm", 0, - NULL, OPAL_CMD_LINE_TYPE_BOOL, - "Execute without creating an allocation-spanning virtual machine (only start daemons on nodes 
hosting application procs)" }, - - { NULL, '\0', "staged", "staged", 0, - &orte_cmd_line.staged_exec, OPAL_CMD_LINE_TYPE_BOOL, - "Used staged execution if inadequate resources are present (cannot support MPI jobs)" }, - - { NULL, '\0', "allow-run-as-root", "allow-run-as-root", 0, - &orte_cmd_line.run_as_root, OPAL_CMD_LINE_TYPE_BOOL, - "Allow execution as root (STRONGLY DISCOURAGED)" }, - - { NULL, '\0', "personality", "personality", 1, - &orte_cmd_line.personality, OPAL_CMD_LINE_TYPE_STRING, - "Comma-separated list of programming model, languages, and containers being used (default=\"ompi\")" }, - - { NULL, '\0', "dvm", "dvm", 0, - &orte_cmd_line.create_dvm, OPAL_CMD_LINE_TYPE_BOOL, - "Create a persistent distributed virtual machine (DVM)" }, - - /* tell the dvm to terminate */ - { NULL, '\0', "terminate", "terminate", 0, - &orte_cmd_line.terminate_dvm, OPAL_CMD_LINE_TYPE_BOOL, - "Terminate the DVM" }, - - /* End of list */ - { NULL, '\0', NULL, NULL, 0, - NULL, OPAL_CMD_LINE_TYPE_NULL, NULL } -}; - -static int define_cli(opal_cmd_line_t *cli) -{ - int i, rc; - bool takeus = false; - - opal_output_verbose(1, orte_schizo_base_framework.framework_output, - "%s schizo:ompi: define_cli", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - - /* protect against bozo error */ - if (NULL == cli) { - return ORTE_ERR_BAD_PARAM; - } - - if (NULL != orte_schizo_base.personalities) { - /* if we aren't included, then ignore us */ - for (i=0; NULL != orte_schizo_base.personalities[i]; i++) { - if (0 == strcmp(orte_schizo_base.personalities[i], "ompi")) { - takeus = true; - break; - } - } - if (!takeus) { - return ORTE_ERR_TAKE_NEXT_OPTION; - } - } - - /* just add ours to the end */ - rc = opal_cmd_line_add(cli, cmd_line_init); - return rc; -} - -static int parse_cli(int argc, int start, char **argv) +static int parse_cli(char **personality, + int argc, int start, char **argv) { int i, j, k; bool ignore; @@ -495,25 +83,15 @@ static int parse_cli(int argc, int start, char **argv) }; bool 
takeus = false; - opal_output_verbose(1, orte_schizo_base_framework.framework_output, - "%s schizo:ompi: parse_cli", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - - /* if they gave us a list of personalities, - * see if we are included */ - if (NULL != orte_schizo_base.personalities) { - for (i=0; NULL != orte_schizo_base.personalities[i]; i++) { - if (0 == strcmp(orte_schizo_base.personalities[i], "ompi")) { - takeus = true; - break; - } - } - if (!takeus) { - return ORTE_ERR_TAKE_NEXT_OPTION; + /* see if we are included */ + for (i=0; NULL != personality[i]; i++) { + if (0 == strcmp(personality[i], "ompi")) { + takeus = true; + break; } - } else { - /* attempt to auto-detect CLI options that - * we recognize */ + } + if (!takeus) { + return ORTE_ERR_TAKE_NEXT_OPTION; } for (i = 0; i < (argc-start); ++i) { @@ -584,7 +162,8 @@ static int parse_cli(int argc, int start, char **argv) return ORTE_SUCCESS; } -static int parse_env(char *path, +static int parse_env(char **personality, + char *path, opal_cmd_line_t *cmd_line, char **srcenv, char ***dstenv) @@ -596,22 +175,16 @@ static int parse_env(char *path, char **vars; bool takeus = false; - opal_output_verbose(1, orte_schizo_base_framework.framework_output, - "%s schizo:ompi: parse_env", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - - if (NULL != orte_schizo_base.personalities) { - /* see if we are included */ - for (i=0; NULL != orte_schizo_base.personalities[i]; i++) { - if (0 == strcmp(orte_schizo_base.personalities[i], "ompi")) { - takeus = true; - break; - } - } - if (!takeus) { - return ORTE_ERR_TAKE_NEXT_OPTION; + /* see if we are included */ + for (i=0; NULL != personality[i]; i++) { + if (0 == strcmp(personality[i], "ompi")) { + takeus = true; + break; } } + if (!takeus) { + return ORTE_ERR_TAKE_NEXT_OPTION; + } for (i = 0; NULL != srcenv[i]; ++i) { if (0 == strncmp("OMPI_", srcenv[i], 5)) { @@ -734,22 +307,16 @@ static int setup_fork(orte_job_t *jdata, char *num_app_ctx; bool takeus = false; - opal_output_verbose(1, 
orte_schizo_base_framework.framework_output, - "%s schizo:ompi: setup_fork", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - - if (NULL != orte_schizo_base.personalities) { /* see if we are included */ - for (i=0; NULL != jdata->personality[i]; i++) { - if (0 == strcmp(jdata->personality[i], "ompi")) { - takeus = true; - break; - } - } - if (!takeus) { - return ORTE_ERR_TAKE_NEXT_OPTION; + for (i=0; NULL != jdata->personality[i]; i++) { + if (0 == strcmp(jdata->personality[i], "ompi")) { + takeus = true; + break; } } + if (!takeus) { + return ORTE_ERR_TAKE_NEXT_OPTION; + } /* see if the mapper thinks we are oversubscribed */ oversubscribed = false; @@ -972,22 +539,16 @@ static int setup_child(orte_job_t *jdata, int32_t nrestarts=0, *nrptr; bool takeus = false; - opal_output_verbose(1, orte_schizo_base_framework.framework_output, - "%s schizo:ompi: setup_child", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - - if (NULL != orte_schizo_base.personalities) { - /* see if we are included */ - for (i=0; NULL != jdata->personality[i]; i++) { - if (0 == strcmp(jdata->personality[i], "ompi")) { - takeus = true; - break; - } - } - if (!takeus) { - return ORTE_ERR_TAKE_NEXT_OPTION; + /* see if we are included */ + for (i=0; NULL != jdata->personality[i]; i++) { + if (0 == strcmp(jdata->personality[i], "ompi")) { + takeus = true; + break; } } + if (!takeus) { + return ORTE_ERR_TAKE_NEXT_OPTION; + } /* setup the jobid */ if (ORTE_SUCCESS != (rc = orte_util_convert_jobid_to_string(&value, child->name.jobid))) { diff --git a/orte/mca/schizo/schizo.h b/orte/mca/schizo/schizo.h index 932b6dfc47e..5906a1e22fd 100644 --- a/orte/mca/schizo/schizo.h +++ b/orte/mca/schizo/schizo.h @@ -44,40 +44,32 @@ BEGIN_C_DECLS * things it requires */ typedef int (*orte_schizo_base_module_init_fn_t)(void); -/* provide an opportunity for components to add personality and/or - * environment-specific command line options. 
The OPAL cli tools - * will add provided options to the CLI definition, and so the - * resulting CLI array will include the _union_ of options provided - * by the various components. Where there is overlap (i.e., an option - * is added that was also defined earlier in the stack), then the - * first definition is used. This reflects the higher priority of - * the original definition - note that this only impacts the help - * message that will be displayed */ -typedef int (*orte_schizo_base_module_define_cli_fn_t)(opal_cmd_line_t *cli); - -/* parse a tool command line +/* given an argv-array of personalities, parse a tool command line * starting from the given location according to the cmd line options * known to this module's personality. First, of course, check that - * this module is included in the base array of personalities, or is - * automatically recognizable! */ -typedef int (*orte_schizo_base_module_parse_cli_fn_t)(int argc, int start, + * this module is included in the specified array of personalities! + * Only one command-line parser is allowed to operate - i.e., if */ +typedef int (*orte_schizo_base_module_parse_cli_fn_t)(char **personality, + int argc, int start, char **argv); -/* parse the environment of the +/* given an argv-array of personalities, parse the environment of the * tool to extract any personality-specific envars that need to be * forward to the app's environment upon execution */ -typedef int (*orte_schizo_base_module_parse_env_fn_t)(char *path, +typedef int (*orte_schizo_base_module_parse_env_fn_t)(char **personality, + char *path, opal_cmd_line_t *cmd_line, char **srcenv, char ***dstenv); -/* do whatever preparation work +/* given an argv-array of personalities, do whatever preparation work * is required to setup the app for execution. 
This is intended to be * used by orterun and other launcher tools to, for example, change * an executable's relative-path to an absolute-path, or add a command * required for starting a particular kind of application (e.g., adding * "java" to start a Java application) */ -typedef int (*orte_schizo_base_module_setup_app_fn_t)(orte_app_context_t *app); +typedef int (*orte_schizo_base_module_setup_app_fn_t)(char **personality, + orte_app_context_t *app); /* add any personality-specific envars required at the job level prior * to beginning to execute local procs */ @@ -115,7 +107,6 @@ typedef void (*orte_schizo_base_module_finalize_fn_t)(void); */ typedef struct { orte_schizo_base_module_init_fn_t init; - orte_schizo_base_module_define_cli_fn_t define_cli; orte_schizo_base_module_parse_cli_fn_t parse_cli; orte_schizo_base_module_parse_env_fn_t parse_env; orte_schizo_base_module_setup_app_fn_t setup_app; diff --git a/orte/mca/schizo/singularity/schizo_singularity.c b/orte/mca/schizo/singularity/schizo_singularity.c index bc70b56787a..059347bf090 100644 --- a/orte/mca/schizo/singularity/schizo_singularity.c +++ b/orte/mca/schizo/singularity/schizo_singularity.c @@ -29,7 +29,8 @@ #include "schizo_singularity.h" -static int setup_app(orte_app_context_t *context); +static int setup_app(char **personality, + orte_app_context_t *context); static int setup_fork(orte_job_t *jdata, orte_app_context_t *context); @@ -38,19 +39,18 @@ orte_schizo_base_module_t orte_schizo_singularity_module = { .setup_fork = setup_fork }; -static int setup_app(orte_app_context_t *app) +static int setup_app(char **personality, + orte_app_context_t *app) { int i; char *newenv, *pth, *t2; bool takeus = false; - if (NULL != orte_schizo_base.personalities) { - /* see if we are included */ - for (i=0; NULL != orte_schizo_base.personalities[i]; i++) { - if (0 == strcmp(orte_schizo_base.personalities[i], "singularity")) { - takeus = true; - break; - } + /* see if we are included */ + for (i=0; NULL != 
personality[i]; i++) { + if (0 == strcmp(personality[i], "singularity")) { + takeus = true; + break; } } if (!takeus) { @@ -113,13 +113,11 @@ static int setup_fork(orte_job_t *jdata, char *p, *t2; char dir[MAXPATHLEN]; - if (NULL != orte_schizo_base.personalities) { - /* see if we are included */ - for (i=0; NULL != jdata->personality[i]; i++) { - if (0 == strcmp(jdata->personality[i], "singularity")) { - takeus = true; - break; - } + /* see if we are included */ + for (i=0; NULL != jdata->personality[i]; i++) { + if (0 == strcmp(jdata->personality[i], "singularity")) { + takeus = true; + break; } } if (!takeus) { diff --git a/orte/orted/orted_main.c b/orte/orted/orted_main.c index 12452aac117..3ced75ff216 100644 --- a/orte/orted/orted_main.c +++ b/orte/orted/orted_main.c @@ -263,7 +263,7 @@ int orte_daemon(int argc, char *argv[]) * Since this process can now handle MCA/GMCA parameters, make sure to * process them. */ - mca_base_cmd_line_process_args(argv, &environ, &environ); + mca_base_cmd_line_process_args(cmd_line, &environ, &environ); /* Ensure that enough of OPAL is setup for us to be able to run */ /* diff --git a/orte/orted/orted_submit.c b/orte/orted/orted_submit.c index 281e3071a5d..9d52f1fb0da 100644 --- a/orte/orted/orted_submit.c +++ b/orte/orted/orted_submit.c @@ -87,20 +87,16 @@ #include "orte/mca/rmaps/rmaps_types.h" #include "orte/mca/rmaps/base/base.h" +#include "orte/mca/schizo/schizo.h" #include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/plm/base/plm_private.h" #include "orte/mca/rml/rml.h" #include "orte/mca/rml/base/rml_contact.h" #include "orte/mca/routed/routed.h" -#include "orte/mca/schizo/schizo.h" -#include "orte/mca/state/state.h" #include "orte/runtime/runtime.h" #include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_wait.h" #include "orte/runtime/orte_quit.h" -#include "orte/util/cmd_line.h" -#include "orte/util/pre_condition_transports.h" #include "orte/util/show_help.h" #include "orted_submit.h" @@ -116,9 +112,187 @@ 
static bool want_prefix_by_default = (bool) ORTE_WANT_ORTERUN_PREFIX_BY_DEFAULT; static opal_pointer_array_t tool_jobs; static opal_cmd_line_t *cmd_line=NULL; static bool mycmdline = false; -int orte_debugger_attach_fd = -1; -bool orte_debugger_fifo_active=false; -opal_event_t *orte_debugger_attach=NULL; + +static opal_cmd_line_init_t cmd_line_init[] = { + /* Various "obvious" options */ + { NULL, 'h', NULL, "help", 0, + &orte_cmd_line.help, OPAL_CMD_LINE_TYPE_BOOL, + "This help message" }, + { NULL, 'V', NULL, "version", 0, + &orte_cmd_line.version, OPAL_CMD_LINE_TYPE_BOOL, + "Print version and exit" }, + + /* tag output */ + { NULL, '\0', "tag-output", "tag-output", 0, + &orte_cmd_line.tag_output, OPAL_CMD_LINE_TYPE_BOOL, + "Tag all output with [job,rank]" }, + { NULL, '\0', "timestamp-output", "timestamp-output", 0, + &orte_cmd_line.timestamp_output, OPAL_CMD_LINE_TYPE_BOOL, + "Timestamp all application process output" }, + { NULL, '\0', "output-filename", "output-filename", 1, + &orte_cmd_line.output_filename, OPAL_CMD_LINE_TYPE_STRING, + "Redirect output from application processes into filename/job/rank/std[out,err,diag]" }, + { NULL, '\0', "merge-stderr-to-stdout", "merge-stderr-to-stdout", 0, + &orte_cmd_line.merge, OPAL_CMD_LINE_TYPE_BOOL, + "Merge stderr to stdout for each process"}, + + /* select stdin option */ + { NULL, '\0', "stdin", "stdin", 1, + &orte_cmd_line.stdin_target, OPAL_CMD_LINE_TYPE_STRING, + "Specify procs to receive stdin [rank, all, none] (default: 0, indicating rank 0)" }, + + /* request that argv[0] be indexed */ + { NULL, '\0', "index-argv-by-rank", "index-argv-by-rank", 0, + &orte_cmd_line.index_argv, OPAL_CMD_LINE_TYPE_BOOL, + "Uniquely index argv[0] for each process using its rank" }, + + /* Preload the binary on the remote machine */ + { NULL, 's', NULL, "preload-binary", 0, + &orte_cmd_line.preload_binaries, OPAL_CMD_LINE_TYPE_BOOL, + "Preload the binary on the remote machine before starting the remote process." 
}, + + /* Preload files on the remote machine */ + { NULL, '\0', NULL, "preload-files", 1, + &orte_cmd_line.preload_files, OPAL_CMD_LINE_TYPE_STRING, + "Preload the comma separated list of files to the remote machines current working directory before starting the remote process." }, + + /* Use an appfile */ + { NULL, '\0', NULL, "app", 1, + &orte_cmd_line.appfile, OPAL_CMD_LINE_TYPE_STRING, + "Provide an appfile; ignore all other command line options" }, + + /* Number of processes; -c, -n, --n, -np, and --np are all + synonyms */ + { NULL, 'c', "np", "np", 1, + &orte_cmd_line.num_procs, OPAL_CMD_LINE_TYPE_INT, + "Number of processes to run" }, + { NULL, '\0', "n", "n", 1, + &orte_cmd_line.num_procs, OPAL_CMD_LINE_TYPE_INT, + "Number of processes to run" }, + + /* uri of the dvm, or at least where to get it */ + { NULL, '\0', "hnp", "hnp", 1, + &orte_cmd_line.hnp, OPAL_CMD_LINE_TYPE_STRING, + "Specify the URI of the Open MPI server, or the name of the file (specified as file:filename) that contains that info" }, + + /* Set a hostfile */ + { NULL, '\0', "hostfile", "hostfile", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Provide a hostfile" }, + { NULL, '\0', "machinefile", "machinefile", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Provide a hostfile" }, + { "orte_default_hostfile", '\0', "default-hostfile", "default-hostfile", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Provide a default hostfile" }, + { "opal_if_do_not_resolve", '\0', "do-not-resolve", "do-not-resolve", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Do not attempt to resolve interfaces" }, + + /* Export environment variables; potentially used multiple times, + so it does not make sense to set into a variable */ + { NULL, 'x', NULL, NULL, 1, + NULL, OPAL_CMD_LINE_TYPE_NULL, + "Export an environment variable, optionally specifying a value (e.g., \"-x foo\" exports the environment variable foo and takes its value from the current environment; \"-x foo=bar\" exports the environment variable name foo and sets its value 
to \"bar\" in the started processes)" }, + + /* Mapping controls */ + { NULL, 'H', "host", "host", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "List of hosts to invoke processes on" }, + { NULL, '\0', "nolocal", "nolocal", 0, + &orte_cmd_line.nolocal, OPAL_CMD_LINE_TYPE_BOOL, + "Do not run any MPI applications on the local node" }, + { NULL, '\0', "nooversubscribe", "nooversubscribe", 0, + &orte_cmd_line.no_oversubscribe, OPAL_CMD_LINE_TYPE_BOOL, + "Nodes are not to be oversubscribed, even if the system supports such operation"}, + { NULL, '\0', "oversubscribe", "oversubscribe", 0, + &orte_cmd_line.oversubscribe, OPAL_CMD_LINE_TYPE_BOOL, + "Nodes are allowed to be oversubscribed, even on a managed system, and overloading of processing elements"}, + { NULL, '\0', "cpus-per-proc", "cpus-per-proc", 1, + &orte_cmd_line.cpus_per_proc, OPAL_CMD_LINE_TYPE_INT, + "Number of cpus to use for each process [default=1]" }, + + /* Nperxxx options that do not require topology and are always + * available - included for backwards compatibility + */ + { NULL, '\0', "pernode", "pernode", 0, + &orte_cmd_line.pernode, OPAL_CMD_LINE_TYPE_BOOL, + "Launch one process per available node" }, + { NULL, '\0', "npernode", "npernode", 1, + &orte_cmd_line.npernode, OPAL_CMD_LINE_TYPE_INT, + "Launch n processes per node on all allocated nodes" }, + { NULL, '\0', "N", NULL, 1, + &orte_cmd_line.npernode, OPAL_CMD_LINE_TYPE_INT, + "Launch n processes per node on all allocated nodes (synonym for npernode)" }, + + /* declare hardware threads as independent cpus */ + { NULL, '\0', "use-hwthread-cpus", "use-hwthread-cpus", 0, + &orte_cmd_line.use_hwthreads_as_cpus, OPAL_CMD_LINE_TYPE_BOOL, + "Use hardware threads as independent cpus" }, + + /* include npersocket for backwards compatibility */ + { NULL, '\0', "npersocket", "npersocket", 1, + &orte_cmd_line.npersocket, OPAL_CMD_LINE_TYPE_INT, + "Launch n processes per socket on all allocated nodes" }, + + /* Mapping options */ + { NULL, '\0', NULL, "map-by", 
1, + &orte_cmd_line.mapping_policy, OPAL_CMD_LINE_TYPE_STRING, + "Mapping Policy [slot | hwthread | core | socket (default) | numa | board | node]" }, + + /* Ranking options */ + { NULL, '\0', NULL, "rank-by", 1, + &orte_cmd_line.ranking_policy, OPAL_CMD_LINE_TYPE_STRING, + "Ranking Policy [slot (default) | hwthread | core | socket | numa | board | node]" }, + + /* Binding options */ + { NULL, '\0', NULL, "bind-to", 1, + &orte_cmd_line.binding_policy, OPAL_CMD_LINE_TYPE_STRING, + "Policy for binding processes. Allowed values: none, hwthread, core, l1cache, l2cache, l3cache, socket, numa, board (\"none\" is the default when oversubscribed, \"core\" is the default when np<=2, and \"socket\" is the default when np>2). Allowed qualifiers: overload-allowed, if-supported" }, + + { NULL, '\0', "report-bindings", "report-bindings", 0, + &orte_cmd_line.report_bindings, OPAL_CMD_LINE_TYPE_BOOL, + "Whether to report process bindings to stderr" }, + + /* slot list option */ + { NULL, '\0', "slot-list", "slot-list", 1, + &orte_cmd_line.slot_list, OPAL_CMD_LINE_TYPE_STRING, + "List of processor IDs to bind processes to [default=NULL]"}, + + /* mpiexec-like arguments */ + { NULL, '\0', "wdir", "wdir", 1, + &orte_cmd_line.wdir, OPAL_CMD_LINE_TYPE_STRING, + "Set the working directory of the started processes" }, + { NULL, '\0', "wd", "wd", 1, + &orte_cmd_line.wdir, OPAL_CMD_LINE_TYPE_STRING, + "Synonym for --wdir" }, + { NULL, '\0', "set-cwd-to-session-dir", "set-cwd-to-session-dir", 0, + &orte_cmd_line.set_cwd_to_session_dir, OPAL_CMD_LINE_TYPE_BOOL, + "Set the working directory of the started processes to their session directory" }, + { NULL, '\0', "path", "path", 1, + &orte_cmd_line.path, OPAL_CMD_LINE_TYPE_STRING, + "PATH to be used to look for executables to start processes" }, + + { NULL, '\0', "enable-recovery", "enable-recovery", 0, + &orte_cmd_line.enable_recovery, OPAL_CMD_LINE_TYPE_BOOL, + "Enable recovery (resets all recovery options to on)" }, + + { NULL, '\0', 
"personality", "personality", 1, + &orte_cmd_line.personality, OPAL_CMD_LINE_TYPE_STRING, + "Comma-separated list of programming model, languages, and containers being used (default=\"ompi\")" }, + + { NULL, 'd', "debug-devel", "debug-devel", 0, + &orte_cmd_line.debug, OPAL_CMD_LINE_TYPE_BOOL, + "Enable debugging of OpenRTE" }, + + { NULL, '\0', "allow-run-as-root", "allow-run-as-root", 0, + &orte_cmd_line.run_as_root, OPAL_CMD_LINE_TYPE_BOOL, + "Allow execution as root (STRONGLY DISCOURAGED)" }, + + /* End of list */ + { NULL, '\0', NULL, NULL, 0, + NULL, OPAL_CMD_LINE_TYPE_NULL, NULL } +}; /* * Local functions @@ -139,38 +313,6 @@ static void launch_recv(int status, orte_process_name_t* sender, static void complete_recv(int status, orte_process_name_t* sender, opal_buffer_t *buffer, orte_rml_tag_t tag, void *cbdata); -static void attach_debugger(int fd, short event, void *arg); -static void build_debugger_args(orte_app_context_t *debugger); -static void open_fifo (void); -static void run_debugger(char *basename, opal_cmd_line_t *cmd_line, - int argc, char *argv[], int num_procs); - -/* instance the standard MPIR interfaces */ -#define MPIR_MAX_PATH_LENGTH 512 -#define MPIR_MAX_ARG_LENGTH 1024 -struct MPIR_PROCDESC *MPIR_proctable = NULL; -int MPIR_proctable_size = 0; -volatile int MPIR_being_debugged = 0; -volatile int MPIR_debug_state = 0; -int MPIR_i_am_starter = 0; -int MPIR_partial_attach_ok = 1; -char MPIR_executable_path[MPIR_MAX_PATH_LENGTH] = {0}; -char MPIR_server_arguments[MPIR_MAX_ARG_LENGTH] = {0}; -volatile int MPIR_forward_output = 0; -volatile int MPIR_forward_comm = 0; -char MPIR_attach_fifo[MPIR_MAX_PATH_LENGTH] = {0}; -int MPIR_force_to_main = 0; -static void orte_debugger_init_before_spawn(orte_job_t *jdata); - -ORTE_DECLSPEC void* MPIR_Breakpoint(void); - -/* - * Breakpoint function for parallel debuggers - */ -void* MPIR_Breakpoint(void) -{ - return NULL; -} /* local objects */ typedef struct { @@ -204,8 +346,6 @@ int orte_submit_init(int 
argc, char *argv[], opal_cmd_line_t *opts) { int rc; - bool version, help; - char *param; OBJ_CONSTRUCT(&tool_jobs, opal_pointer_array_t); opal_pointer_array_init(&tool_jobs, 256, INT_MAX, 128); @@ -216,8 +356,8 @@ int orte_submit_init(int argc, char *argv[], /* setup the cmd line only once */ if (NULL != opts) { - /* just add the component-defined ones to the end */ - if (OPAL_SUCCESS != (rc = orte_schizo.define_cli(opts))) { + /* just add ours to the end */ + if (OPAL_SUCCESS != (rc = opal_cmd_line_add(opts, cmd_line_init))) { return rc; } cmd_line = opts; @@ -225,46 +365,55 @@ int orte_submit_init(int argc, char *argv[], } else { /* create our own */ cmd_line = OBJ_NEW(opal_cmd_line_t); - rc = orte_cmd_line_create(cmd_line, argc, argv, - &environ, &environ, - &version, &help); - if (ORTE_SUCCESS != rc) { - OBJ_RELEASE(cmd_line); - return rc; + opal_cmd_line_create(cmd_line, cmd_line_init); + mca_base_cmd_line_setup(cmd_line); + mycmdline = true; + } + + /* parse the cmd line - we do this here to get the initial + * MCA parameters that might impact our own init */ + if (OPAL_SUCCESS != (rc = opal_cmd_line_parse(cmd_line, true, + argc, argv)) ) { + if (OPAL_ERR_SILENT != rc) { + fprintf(stderr, "%s: command line error (%s)\n", argv[0], + opal_strerror(rc)); } - /* print version if requested. Do this before check for help so - that --version --help works as one might expect. */ - if (version) { - char *str, *project_name = NULL; - if (0 == strcmp(orte_basename, "mpirun")) { - project_name = "Open MPI"; - } else { - project_name = "OpenRTE"; - } - str = opal_info_make_version_str("all", - OPAL_MAJOR_VERSION, OPAL_MINOR_VERSION, - OPAL_RELEASE_VERSION, - OPAL_GREEK_VERSION, - OPAL_REPO_REV); - if (NULL != str) { - fprintf(stdout, "%s (%s) %s\n\nReport bugs to %s\n", - orte_basename, project_name, str, PACKAGE_BUGREPORT); - free(str); - } - exit(0); + return rc; + } + + /* print version if requested. 
Do this before check for help so + that --version --help works as one might expect. */ + if (orte_cmd_line.version) { + char *str; + str = opal_info_make_version_str("all", + OPAL_MAJOR_VERSION, OPAL_MINOR_VERSION, + OPAL_RELEASE_VERSION, + OPAL_GREEK_VERSION, + OPAL_REPO_REV); + if (NULL != str) { + fprintf(stdout, "%s %s\n\nReport bugs to %s\n", + orte_basename, str, PACKAGE_BUGREPORT); + free(str); } - mycmdline = true; + return ORTE_ERR_SILENT; + } + + /* process MCA/GMCA parameters */ + if (OPAL_SUCCESS != (rc = mca_base_cmd_line_process_args(cmd_line, &environ, &environ))) { + return rc; } /* Need to initialize OPAL so that install_dirs are filled in */ if (OPAL_SUCCESS != (rc = opal_init(&argc, &argv))) { + OBJ_DESTRUCT(&cmd_line); return rc; } /* Check for help request */ - if (help) { + if (orte_cmd_line.help) { char *str, *args = NULL; char *project_name = NULL; + if (0 == strcmp(orte_basename, "mpirun")) { project_name = "Open MPI"; } else { @@ -280,158 +429,67 @@ int orte_submit_init(int argc, char *argv[], free(str); } free(args); + /* If someone asks for help, that should be all we do */ exit(0); } - /* set the flags - if they gave us a -hnp option, then - * we are a tool. 
If not, then we are an HNP */ + /* if they didn't point us at an HNP, that's an error */ if (NULL == orte_cmd_line.hnp) { - orte_process_info.proc_type = ORTE_PROC_HNP; - } else { - orte_process_info.proc_type = ORTE_PROC_TOOL; + fprintf(stderr, "%s submit: required option --hnp not provided\n", orte_basename); + return ORTE_ERROR; } - if (ORTE_PROC_IS_TOOL) { - if (0 == strncasecmp(orte_cmd_line.hnp, "file", strlen("file"))) { - char input[1024], *filename; - FILE *fp; + if (0 == strncasecmp(orte_cmd_line.hnp, "file", strlen("file"))) { + char input[1024], *filename; + FILE *fp; - /* it is a file - get the filename */ - filename = strchr(orte_cmd_line.hnp, ':'); - if (NULL == filename) { - /* filename is not correctly formatted */ - orte_show_help("help-orte-top.txt", "orte-top:hnp-filename-bad", true, "uri", orte_cmd_line.hnp); - exit(1); - } - ++filename; /* space past the : */ + /* it is a file - get the filename */ + filename = strchr(orte_cmd_line.hnp, ':'); + if (NULL == filename) { + /* filename is not correctly formatted */ + orte_show_help("help-orte-top.txt", "orte-top:hnp-filename-bad", true, "uri", orte_cmd_line.hnp); + exit(1); + } + ++filename; /* space past the : */ - if (0 >= strlen(filename)) { - /* they forgot to give us the name! */ - orte_show_help("help-orte-top.txt", "orte-top:hnp-filename-bad", true, "uri", orte_cmd_line.hnp); - exit(1); - } + if (0 >= strlen(filename)) { + /* they forgot to give us the name! */ + orte_show_help("help-orte-top.txt", "orte-top:hnp-filename-bad", true, "uri", orte_cmd_line.hnp); + exit(1); + } - /* open the file and extract the uri */ - fp = fopen(filename, "r"); - if (NULL == fp) { /* can't find or read file! 
*/ - orte_show_help("help-orte-top.txt", "orte-top:hnp-filename-access", true, orte_cmd_line.hnp); - exit(1); - } - /* initialize the input to NULLs to ensure any input - * string is NULL-terminated */ - memset(input, 0, 1024); - if (NULL == fgets(input, 1024, fp)) { - /* something malformed about file */ - fclose(fp); - orte_show_help("help-orte-top.txt", "orte-top:hnp-file-bad", true, orte_cmd_line.hnp); - exit(1); - } + /* open the file and extract the uri */ + fp = fopen(filename, "r"); + if (NULL == fp) { /* can't find or read file! */ + orte_show_help("help-orte-top.txt", "orte-top:hnp-filename-access", true, orte_cmd_line.hnp); + exit(1); + } + /* initialize the input to NULLs to ensure any input + * string is NULL-terminated */ + memset(input, 0, 1024); + if (NULL == fgets(input, 1024, fp)) { + /* something malformed about file */ fclose(fp); - input[strlen(input)-1] = '\0'; /* remove newline */ - /* construct the target hnp info */ - opal_setenv(OPAL_MCA_PREFIX"orte_hnp_uri", input, true, &environ); - } else { - /* should just be the uri itself - construct the target hnp info */ - opal_setenv(OPAL_MCA_PREFIX"orte_hnp_uri", orte_cmd_line.hnp, true, &environ); + orte_show_help("help-orte-top.txt", "orte-top:hnp-file-bad", true, orte_cmd_line.hnp); + exit(1); } - /* we are never allowed to operate as a distributed tool, - * so insist on the ess/tool component */ - opal_setenv(OPAL_MCA_PREFIX"ess", "tool", true, &environ); + fclose(fp); + input[strlen(input)-1] = '\0'; /* remove newline */ + /* construct the target hnp info */ + opal_setenv(OPAL_MCA_PREFIX"orte_hnp_uri", input, true, &environ); } else { - /* may look strange, but the way we handle prefix is a little weird - * and probably needs to be addressed more fully at some future point. - * For now, we have a conflict between app_files and cmd line usage. - * Since app_files are used by the C/R system, we will make an - * adjustment here to avoid perturbing that system. 
- * - * We cannot just have the cmd line parser place any found value - * in the global struct as the app_file parser would replace it. - * So handle this specific cmd line option manually. - */ - orte_cmd_line.prefix = NULL; - orte_cmd_line.path_to_mpirun = NULL; - if (opal_cmd_line_is_taken(cmd_line, "prefix") || - '/' == argv[0][0] || want_prefix_by_default) { - size_t param_len; - if ('/' == argv[0][0]) { - char* tmp_basename = NULL; - /* If they specified an absolute path, strip off the - /bin/" and leave just the prefix */ - orte_cmd_line.path_to_mpirun = opal_dirname(argv[0]); - /* Quick sanity check to ensure we got - something/bin/ and that the installation - tree is at least more or less what we expect it to - be */ - tmp_basename = opal_basename(orte_cmd_line.path_to_mpirun); - if (0 == strcmp("bin", tmp_basename)) { - char* tmp = orte_cmd_line.path_to_mpirun; - orte_cmd_line.path_to_mpirun = opal_dirname(tmp); - free(tmp); - } else { - free(orte_cmd_line.path_to_mpirun); - orte_cmd_line.path_to_mpirun = NULL; - } - free(tmp_basename); - } - /* if both are given, check to see if they match */ - if (opal_cmd_line_is_taken(cmd_line, "prefix") && NULL != orte_cmd_line.path_to_mpirun) { - char *tmp_basename; - /* if they don't match, then that merits a warning */ - param = strdup(opal_cmd_line_get_param(cmd_line, "prefix", 0, 0)); - /* ensure we strip any trailing '/' */ - if (0 == strcmp(OPAL_PATH_SEP, &(param[strlen(param)-1]))) { - param[strlen(param)-1] = '\0'; - } - tmp_basename = strdup(orte_cmd_line.path_to_mpirun); - if (0 == strcmp(OPAL_PATH_SEP, &(tmp_basename[strlen(tmp_basename)-1]))) { - tmp_basename[strlen(tmp_basename)-1] = '\0'; - } - if (0 != strcmp(param, tmp_basename)) { - orte_show_help("help-orterun.txt", "orterun:double-prefix", - true, orte_basename, orte_basename, - param, tmp_basename, orte_basename); - /* use the prefix over the path-to-mpirun so that - * people can specify the backend prefix as different - * from the local one - */ 
- free(orte_cmd_line.path_to_mpirun); - orte_cmd_line.path_to_mpirun = NULL; - } - free(tmp_basename); - } else if (NULL != orte_cmd_line.path_to_mpirun) { - param = strdup(orte_cmd_line.path_to_mpirun); - } else if (opal_cmd_line_is_taken(cmd_line, "prefix")){ - /* must be --prefix alone */ - param = strdup(opal_cmd_line_get_param(cmd_line, "prefix", 0, 0)); - } else { - /* --enable-orterun-prefix-default was given to orterun */ - param = strdup(opal_install_dirs.prefix); - } - - if (NULL != param) { - /* "Parse" the param, aka remove superfluous path_sep. */ - param_len = strlen(param); - while (0 == strcmp (OPAL_PATH_SEP, &(param[param_len-1]))) { - param[param_len-1] = '\0'; - param_len--; - if (0 == param_len) { - orte_show_help("help-orterun.txt", "orterun:empty-prefix", - true, orte_basename, orte_basename); - free(param); - return ORTE_ERR_FATAL; - } - } - - orte_cmd_line.prefix = param; - } - want_prefix_by_default = true; - } + /* should just be the uri itself - construct the target hnp info */ + opal_setenv(OPAL_MCA_PREFIX"orte_hnp_uri", orte_cmd_line.hnp, true, &environ); } /* Setup MCA params */ orte_register_params(); + /* we are never allowed to operate as a distributed tool, + * so insist on the ess/tool component */ + opal_setenv(OPAL_MCA_PREFIX"ess", "tool", true, &environ); + if (orte_cmd_line.debug) { orte_devel_level_output = true; } @@ -442,8 +500,7 @@ int orte_submit_init(int argc, char *argv[], * up incorrect infrastructure that only a singleton would * require */ - if (ORTE_SUCCESS != (rc = orte_init(&argc, &argv, - orte_process_info.proc_type))) { + if (ORTE_SUCCESS != (rc = orte_init(&argc, &argv, ORTE_PROC_TOOL))) { /* cannot call ORTE_ERROR_LOG as it could be the errmgr * never got loaded! 
*/ @@ -454,44 +511,33 @@ int orte_submit_init(int argc, char *argv[], */ opal_finalize(); - /* clear params from the environment so our children - * don't pick them up */ + /* clear the ess param from the environment so our children + * don't pick it up */ opal_unsetenv(OPAL_MCA_PREFIX"ess", &environ); - opal_unsetenv(OPAL_MCA_PREFIX"pmix", &environ); - - if (ORTE_PROC_IS_TOOL) { - /* set the info in our contact table */ - orte_rml.set_contact_info(orte_process_info.my_hnp_uri); - /* extract the name */ - if (ORTE_SUCCESS != orte_rml_base_parse_uris(orte_process_info.my_hnp_uri, ORTE_PROC_MY_HNP, NULL)) { - orte_show_help("help-orte-top.txt", "orte-top:hnp-uri-bad", true, orte_process_info.my_hnp_uri); - exit(1); - } - /* set the route to be direct */ - if (ORTE_SUCCESS != orte_routed.update_route(ORTE_PROC_MY_HNP, ORTE_PROC_MY_HNP)) { - orte_show_help("help-orte-top.txt", "orte-top:hnp-uri-bad", true, orte_process_info.my_hnp_uri); - orte_finalize(); - exit(1); - } - /* set the target hnp as our lifeline so we will terminate if it exits */ - orte_routed.set_lifeline(ORTE_PROC_MY_HNP); - - /* setup to listen for HNP response to my commands */ - orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_NOTIFY_COMPLETE, - ORTE_RML_PERSISTENT, complete_recv, NULL); - orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_LAUNCH_RESP, - ORTE_RML_PERSISTENT, launch_recv, NULL); - } else { - /* save the environment for launch purposes. 
This MUST be - * done so that we can pass it to any local procs we - * spawn - otherwise, those local procs won't see any - * non-MCA envars were set in the enviro prior to calling - * orterun - */ - orte_launch_environ = opal_argv_copy(environ); + /* set the info in our contact table */ + orte_rml.set_contact_info(orte_process_info.my_hnp_uri); + /* extract the name */ + if (ORTE_SUCCESS != orte_rml_base_parse_uris(orte_process_info.my_hnp_uri, ORTE_PROC_MY_HNP, NULL)) { + orte_show_help("help-orte-top.txt", "orte-top:hnp-uri-bad", true, orte_process_info.my_hnp_uri); + exit(1); + } + /* set the route to be direct */ + if (ORTE_SUCCESS != orte_routed.update_route(ORTE_PROC_MY_HNP, ORTE_PROC_MY_HNP)) { + orte_show_help("help-orte-top.txt", "orte-top:hnp-uri-bad", true, orte_process_info.my_hnp_uri); + orte_finalize(); + exit(1); } + /* set the target hnp as our lifeline so we will terminate if it exits */ + orte_routed.set_lifeline(ORTE_PROC_MY_HNP); + + /* setup to listen for HNP response to my commands */ + orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_NOTIFY_COMPLETE, + ORTE_RML_PERSISTENT, complete_recv, NULL); + orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_LAUNCH_RESP, + ORTE_RML_PERSISTENT, launch_recv, NULL); + return ORTE_SUCCESS; } @@ -587,19 +633,10 @@ int orte_submit_job(char *argv[], int *index, int rc; orte_daemon_cmd_flag_t cmd = ORTE_DAEMON_SPAWN_JOB_CMD; char *param; - orte_job_t *jdata = NULL, *daemons; - orte_app_context_t *app, *dapp; + orte_job_t *jdata = NULL; trackr_t *trk; int argc; - /* bozo check - we don't allow recursive calls of submit */ - if (NULL != getenv("OMPI_UNIVERSE_SIZE")) { - fprintf(stderr, "\n\n**********************************************************\n\n"); - fprintf(stderr, "%s does not support recursive calls\n", orte_basename); - fprintf(stderr, "\n**********************************************************\n"); - return ORTE_ERR_FATAL; - } - /* reset the globals every time thru as the argv * will 
modify them */ memset(&orte_cmd_line, 0, sizeof(orte_cmd_line)); @@ -619,6 +656,13 @@ int orte_submit_job(char *argv[], int *index, /* Check for some "global" command line params */ parse_globals(argc, argv, cmd_line); + /* default our personality to OMPI */ + if (NULL == orte_cmd_line.personality) { + opal_argv_append_nosize(&orte_cmd_line.personalities, "ompi"); + } else { + orte_cmd_line.personalities = opal_argv_split(orte_cmd_line.personality, ','); + } + /* create a new job object to hold the info for this one - the * jobid field will be filled in by the PLM when the job is * launched @@ -630,6 +674,7 @@ int orte_submit_job(char *argv[], int *index, */ return ORTE_ERR_OUT_OF_RESOURCE; } + jdata->personality = opal_argv_copy(orte_cmd_line.personalities); trk = OBJ_NEW(trackr_t); trk->jdata = jdata; trk->launch_cb = launch_cb; @@ -641,6 +686,12 @@ int orte_submit_job(char *argv[], int *index, /* pass our tracker ID */ orte_set_attribute(&jdata->attributes, ORTE_JOB_ROOM_NUM, ORTE_ATTR_GLOBAL, &trk->index, OPAL_INT); + /* flag that we are using the DVM */ + orte_set_attribute(&jdata->attributes, ORTE_JOB_DVM_JOB, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL); + /* flag that the allocation is static - i.e., the DVM is not allowed + * to be adjusted once started, and all unused nodes are to be + * removed from the node pool */ + orte_set_attribute(&jdata->attributes, ORTE_JOB_FIXED_DVM, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL); /* check for stdout/err directives */ /* if we were asked to tag output, mark it so */ @@ -680,15 +731,6 @@ int orte_submit_job(char *argv[], int *index, /* Parse each app, adding it to the job object */ parse_locals(jdata, argc, argv); - if (0 == jdata->num_apps) { - /* This should never happen -- this case should be caught in - create_app(), but let's just double check... 
*/ - orte_show_help("help-orterun.txt", "orterun:nothing-to-do", - true, orte_basename); - ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); - return ORTE_ERR_FATAL; - } - /* create the map object to communicate policies */ jdata->map = OBJ_NEW(orte_job_map_t); @@ -741,17 +783,12 @@ int orte_submit_job(char *argv[], int *index, orte_set_attribute(&jdata->attributes, ORTE_JOB_SLOT_LIST, ORTE_ATTR_GLOBAL, orte_cmd_line.slot_list, OPAL_STRING); } - /* if recovery was disabled on the cmd line, do so */ - if (orte_cmd_line.enable_recovery) { - ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_RECOVERABLE); - } - - /* check for suicide test directives */ - if (NULL != getenv("ORTE_TEST_HNP_SUICIDE") || - NULL != getenv("ORTE_TEST_ORTED_SUICIDE")) { - /* don't forward IO from this process so we can - * see any debug after daemon termination */ - ORTE_FLAG_UNSET(jdata, ORTE_JOB_FLAG_FORWARD_OUTPUT); + if (0 == jdata->num_apps) { + /* This should never happen -- this case should be caught in + create_app(), but let's just double check... */ + orte_show_help("help-orterun.txt", "orterun:nothing-to-do", + true, orte_basename); + return ORTE_ERROR_DEFAULT_EXIT_CODE; } /* check for a job timeout specification, to be provided in seconds @@ -771,137 +808,31 @@ int orte_submit_job(char *argv[], int *index, opal_event_evtimer_add(orte_mpiexec_timeout->ev, &orte_mpiexec_timeout->tv); } - if (ORTE_PROC_IS_HNP) { - /* get the daemon job object */ - daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); - - /* check for request to report uri */ - if (NULL != orte_cmd_line.report_uri) { - FILE *fp; - char *rml_uri; - rml_uri = orte_rml.get_contact_info(); - if (0 == strcmp(orte_cmd_line.report_uri, "-")) { - /* if '-', then output to stdout */ - printf("%s\n", (NULL == rml_uri) ? "NULL" : rml_uri); - } else if (0 == strcmp(orte_cmd_line.report_uri, "+")) { - /* if '+', output to stderr */ - fprintf(stderr, "%s\n", (NULL == rml_uri) ? 
"NULL" : rml_uri); - } else { - fp = fopen(orte_cmd_line.report_uri, "w"); - if (NULL == fp) { - orte_show_help("help-orterun.txt", "orterun:write_file", false, - orte_basename, "uri", orte_cmd_line.report_uri); - exit(0); - } - fprintf(fp, "%s\n", (NULL == rml_uri) ? "NULL" : rml_uri); - fclose(fp); - } - if (NULL != rml_uri) { - free(rml_uri); - } - } - /* If we have a prefix, then modify the PATH and - LD_LIBRARY_PATH environment variables in our copy. This - will ensure that any locally-spawned children will - have our executables and libraries in their path - - For now, default to the prefix_dir provided in the first app_context. - Since there always MUST be at least one app_context, we are safe in - doing this. - */ - param = NULL; - if (NULL != (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0)) && - orte_get_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, (void**)¶m, OPAL_STRING)) { - char *oldenv, *newenv, *lib_base, *bin_base; - - /* copy the prefix into the daemon job so that any launcher - * can find the orteds when we launch the virtual machine - */ - if (NULL == (dapp = (orte_app_context_t*)opal_pointer_array_get_item(daemons->apps, 0))) { - /* that's an error in the ess */ - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return ORTE_ERR_NOT_FOUND; - } - orte_set_attribute(&dapp->attributes, ORTE_APP_PREFIX_DIR, ORTE_ATTR_LOCAL, param, OPAL_STRING); - - lib_base = opal_basename(opal_install_dirs.libdir); - bin_base = opal_basename(opal_install_dirs.bindir); - - /* Reset PATH */ - newenv = opal_os_path( false, param, bin_base, NULL ); - oldenv = getenv("PATH"); - if (NULL != oldenv) { - char *temp; - asprintf(&temp, "%s:%s", newenv, oldenv ); - free( newenv ); - newenv = temp; - } - opal_setenv("PATH", newenv, true, &orte_launch_environ); - if (orte_debug_flag) { - opal_output(0, "%s: reset PATH: %s", orte_basename, newenv); - } - free(newenv); - free(bin_base); - - /* Reset LD_LIBRARY_PATH */ - newenv = opal_os_path( false, param, 
lib_base, NULL ); - oldenv = getenv("LD_LIBRARY_PATH"); - if (NULL != oldenv) { - char* temp; - asprintf(&temp, "%s:%s", newenv, oldenv); - free(newenv); - newenv = temp; - } - opal_setenv("LD_LIBRARY_PATH", newenv, true, &orte_launch_environ); - if (orte_debug_flag) { - opal_output(0, "%s: reset LD_LIBRARY_PATH: %s", - orte_basename, newenv); - } - free(newenv); - free(lib_base); - free(param); - } + /* if recovery was disabled on the cmd line, do so */ + if (orte_cmd_line.enable_recovery) { + ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_RECOVERABLE); + } - /* pre-condition any network transports that require it */ - if (ORTE_SUCCESS != (rc = orte_pre_condition_transports(jdata))) { - ORTE_ERROR_LOG(rc); - orte_show_help("help-orterun.txt", "orterun:precondition", false, - orte_basename, NULL, NULL, rc); - ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); - return rc; - } - /* setup for debugging */ - orte_debugger_init_before_spawn(jdata); + // pack the ORTE_DAEMON_SPAWN_JOB_CMD command and job object and send to HNP at tag ORTE_RML_TAG_DAEMON + req = OBJ_NEW(opal_buffer_t); + if (OPAL_SUCCESS != (rc = opal_dss.pack(req, &cmd, 1, ORTE_DAEMON_CMD))) { + ORTE_ERROR_LOG(rc); + return rc; + } + if (OPAL_SUCCESS != (rc = opal_dss.pack(req, &jdata, 1, ORTE_JOB))) { + ORTE_ERROR_LOG(rc); + return rc; + } + if (OPAL_SUCCESS != (rc = opal_dss.pack(req, &trk->index, 1, OPAL_INT))) { + ORTE_ERROR_LOG(rc); + return rc; + } - rc = orte_plm.spawn(jdata); - } else { - /* flag that we are using the DVM */ - orte_set_attribute(&jdata->attributes, ORTE_JOB_DVM_JOB, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL); - /* flag that the allocation is static - i.e., the DVM is not allowed - * to be adjusted once started, and all unused nodes are to be - * removed from the node pool */ - orte_set_attribute(&jdata->attributes, ORTE_JOB_FIXED_DVM, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL); - // pack the ORTE_DAEMON_SPAWN_JOB_CMD command and job object and send to HNP at tag ORTE_RML_TAG_DAEMON - req = 
OBJ_NEW(opal_buffer_t); - if (OPAL_SUCCESS != (rc = opal_dss.pack(req, &cmd, 1, ORTE_DAEMON_CMD))) { - ORTE_ERROR_LOG(rc); - return rc; - } - if (OPAL_SUCCESS != (rc = opal_dss.pack(req, &jdata, 1, ORTE_JOB))) { - ORTE_ERROR_LOG(rc); - return rc; - } - if (OPAL_SUCCESS != (rc = opal_dss.pack(req, &trk->index, 1, OPAL_INT))) { - ORTE_ERROR_LOG(rc); - return rc; - } - orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, req, ORTE_RML_TAG_DAEMON, orte_rml_send_callback, NULL); + orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, req, ORTE_RML_TAG_DAEMON, orte_rml_send_callback, NULL); - /* Inform the caller of the tracker index if they passed a index pointer */ - if (NULL != index) { - *index = trk->index; - } - } + /* Inform the caller of the tracker index if they passed a index pointer */ + if (NULL != index) + *index = trk->index; return ORTE_SUCCESS; @@ -959,12 +890,6 @@ static int parse_globals(int argc, char* argv[], opal_cmd_line_t *cmd_line) } } - /* Do we want a user-level debugger? */ - - if (orte_cmd_line.debugger) { - run_debugger(orte_basename, cmd_line, argc, argv, orte_cmd_line.num_procs); - } - return ORTE_SUCCESS; } @@ -1146,7 +1071,7 @@ static int create_app(int argc, char* argv[], orte_app_context_t **app_ptr, bool *made_app, char ***app_env) { - opal_cmd_line_t app_cmd_line; + opal_cmd_line_t cmd_line; char cwd[OPAL_PATH_MAX]; int i, j, count, rc; char *param, *value; @@ -1168,28 +1093,31 @@ static int create_app(int argc, char* argv[], * Only pick up '-mca foo bar' on this pass. */ if (NULL != orte_cmd_line.appfile) { - if (ORTE_SUCCESS != (rc = orte_schizo.parse_cli(argc, 0, argv))) { + if (ORTE_SUCCESS != (rc = orte_schizo.parse_cli(orte_cmd_line.personalities, argc, 0, argv))) { goto cleanup; } } /* Parse application command line options. 
*/ init_globals(); - OBJ_CONSTRUCT(&app_cmd_line, opal_cmd_line_t); - rc = orte_cmd_line_create(&app_cmd_line, argc, argv, - app_env, &global_mca_env, - NULL, NULL); + opal_cmd_line_create(&cmd_line, cmd_line_init); + mca_base_cmd_line_setup(&cmd_line); cmd_line_made = true; + rc = opal_cmd_line_parse(&cmd_line, true, argc, argv); + if (ORTE_SUCCESS != rc) { + goto cleanup; + } + mca_base_cmd_line_process_args(&cmd_line, app_env, &global_mca_env); /* Is there an appfile in here? */ if (NULL != orte_cmd_line.appfile) { - OBJ_DESTRUCT(&app_cmd_line); + OBJ_DESTRUCT(&cmd_line); return parse_appfile(jdata, strdup(orte_cmd_line.appfile), app_env); } /* Setup application context */ app = OBJ_NEW(orte_app_context_t); - opal_cmd_line_get_tail(&app_cmd_line, &count, &app->argv); + opal_cmd_line_get_tail(&cmd_line, &count, &app->argv); /* See if we have anything left */ if (0 == count) { @@ -1206,15 +1134,16 @@ static int create_app(int argc, char* argv[], * mpirun -np 2 -mca foo bar ./my-app -mca bip bop * We want to pick up '-mca foo bar' but not '-mca bip bop' */ - if (ORTE_SUCCESS != (rc = orte_schizo.parse_cli(argc, count, argv))) { + if (ORTE_SUCCESS != (rc = orte_schizo.parse_cli(orte_cmd_line.personalities, argc, count, argv))) { goto cleanup; } /* Grab all MCA environment variables */ app->env = opal_argv_copy(*app_env); - if (ORTE_SUCCESS != (rc = orte_schizo.parse_env(orte_cmd_line.path, - &app_cmd_line, + if (ORTE_SUCCESS != (rc = orte_schizo.parse_env(orte_cmd_line.personalities, + orte_cmd_line.path, + &cmd_line, environ, &app->env))) { goto cleanup; } @@ -1258,20 +1187,20 @@ static int create_app(int argc, char* argv[], /* Check to see if the user explicitly wanted to disable automatic --prefix behavior */ - if (opal_cmd_line_is_taken(&app_cmd_line, "noprefix")) { + if (opal_cmd_line_is_taken(&cmd_line, "noprefix")) { want_prefix_by_default = false; } /* Did the user specify a prefix, or want prefix by default? 
*/ - if (opal_cmd_line_is_taken(&app_cmd_line, "prefix") || want_prefix_by_default) { + if (opal_cmd_line_is_taken(&cmd_line, "prefix") || want_prefix_by_default) { size_t param_len; /* if both the prefix was given and we have a prefix * given above, check to see if they match */ - if (opal_cmd_line_is_taken(&app_cmd_line, "prefix") && + if (opal_cmd_line_is_taken(&cmd_line, "prefix") && NULL != orte_cmd_line.prefix) { /* if they don't match, then that merits a warning */ - param = strdup(opal_cmd_line_get_param(&app_cmd_line, "prefix", 0, 0)); + param = strdup(opal_cmd_line_get_param(&cmd_line, "prefix", 0, 0)); /* ensure we strip any trailing '/' */ if (0 == strcmp(OPAL_PATH_SEP, &(param[strlen(param)-1]))) { param[strlen(param)-1] = '\0'; @@ -1292,9 +1221,9 @@ static int create_app(int argc, char* argv[], free(value); } else if (NULL != orte_cmd_line.prefix) { param = strdup(orte_cmd_line.prefix); - } else if (opal_cmd_line_is_taken(&app_cmd_line, "prefix")){ + } else if (opal_cmd_line_is_taken(&cmd_line, "prefix")){ /* must be --prefix alone */ - param = strdup(opal_cmd_line_get_param(&app_cmd_line, "prefix", 0, 0)); + param = strdup(opal_cmd_line_get_param(&cmd_line, "prefix", 0, 0)); } else { /* --enable-orterun-prefix-default was given to orterun */ param = strdup(opal_install_dirs.prefix); @@ -1323,32 +1252,32 @@ static int create_app(int argc, char* argv[], * hostfile and machine file. * We can only deal with one hostfile per app context, otherwise give an error. 
*/ - if (0 < (j = opal_cmd_line_get_ninsts(&app_cmd_line, "hostfile"))) { + if (0 < (j = opal_cmd_line_get_ninsts(&cmd_line, "hostfile"))) { if(1 < j) { orte_show_help("help-orterun.txt", "orterun:multiple-hostfiles", true, orte_basename, NULL); return ORTE_ERR_FATAL; } else { - value = opal_cmd_line_get_param(&app_cmd_line, "hostfile", 0, 0); + value = opal_cmd_line_get_param(&cmd_line, "hostfile", 0, 0); orte_set_attribute(&app->attributes, ORTE_APP_HOSTFILE, ORTE_ATTR_GLOBAL, value, OPAL_STRING); } } - if (0 < (j = opal_cmd_line_get_ninsts(&app_cmd_line, "machinefile"))) { + if (0 < (j = opal_cmd_line_get_ninsts(&cmd_line, "machinefile"))) { if(1 < j || orte_get_attribute(&app->attributes, ORTE_APP_HOSTFILE, NULL, OPAL_STRING)) { orte_show_help("help-orterun.txt", "orterun:multiple-hostfiles", true, orte_basename, NULL); return ORTE_ERR_FATAL; } else { - value = opal_cmd_line_get_param(&app_cmd_line, "machinefile", 0, 0); + value = opal_cmd_line_get_param(&cmd_line, "machinefile", 0, 0); orte_set_attribute(&app->attributes, ORTE_APP_HOSTFILE, ORTE_ATTR_GLOBAL, value, OPAL_STRING); } } /* Did the user specify any hosts? 
*/ - if (0 < (j = opal_cmd_line_get_ninsts(&app_cmd_line, "host"))) { + if (0 < (j = opal_cmd_line_get_ninsts(&cmd_line, "host"))) { char **targ=NULL, *tval; for (i = 0; i < j; ++i) { - value = opal_cmd_line_get_param(&app_cmd_line, "host", i, 0); + value = opal_cmd_line_get_param(&cmd_line, "host", i, 0); opal_argv_append_nosize(&targ, value); } tval = opal_argv_join(targ, ','); @@ -1566,7 +1495,7 @@ static int create_app(int argc, char* argv[], OBJ_RELEASE(app); } if (cmd_line_made) { - OBJ_DESTRUCT(&app_cmd_line); + OBJ_DESTRUCT(&cmd_line); } return rc; } @@ -1725,6 +1654,20 @@ static int parse_appfile(orte_job_t *jdata, char *filename, char ***env) return ORTE_SUCCESS; } +void orte_timeout_wakeup(int sd, short args, void *cbdata) +{ + char *tm; + + /* this function gets called when the job execution time + * has hit a prescribed limit - so just abort + */ + tm = getenv("MPIEXEC_TIMEOUT"); + orte_show_help("help-orterun.txt", "orterun:timeout", + true, (NULL == tm) ? "NULL" : tm); + ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); + exit(orte_exit_status); +} + static void launch_recv(int status, orte_process_name_t* sender, opal_buffer_t *buffer, orte_rml_tag_t tag, void *cbdata) @@ -1865,880 +1808,3 @@ static void complete_recv(int status, orte_process_name_t* sender, opal_pointer_array_set_item(&tool_jobs, tool_job_index, NULL); OBJ_RELEASE(trk); } - - -/**** DEBUGGER CODE ****/ -/* - * Debugger support for orterun - * - * We interpret the MPICH debugger interface as follows: - * - * a) The launcher - * - spawns the other processes, - * - fills in the table MPIR_proctable, and sets MPIR_proctable_size - * - sets MPIR_debug_state to MPIR_DEBUG_SPAWNED ( = 1) - * - calls MPIR_Breakpoint() which the debugger will have a - * breakpoint on. - * - * b) Applications start and then spin until MPIR_debug_gate is set - * non-zero by the debugger. - * - * This file implements (a). 
- * - ************************************************************************** - * - * Note that we have presently tested both TotalView and DDT parallel - * debuggers. They both nominally subscribe to the Etnus attaching - * interface, but there are differences between the two. - * - * TotalView: user launches "totalview mpirun -a ......". - * TV launches mpirun. mpirun launches the application and then calls - * MPIR_Breakpoint(). This is the signal to TV that it's a parallel - * MPI job. TV then reads the proctable in mpirun and attaches itself - * to all the processes (it takes care of launching itself on the - * remote nodes). Upon attaching to all the MPI processes, the - * variable MPIR_being_debugged is set to 1. When it has finished - * attaching itself to all the MPI processes that it wants to, - * MPIR_Breakpoint() returns. - * - * DDT: user launches "ddt bin -np X ". DDT fork/exec's - * mpirun to launch ddt-debugger on the back-end nodes via "mpirun -np - * X ddt-debugger" (not the lack of other arguments -- we can't pass - * anything to mpirun). This app will eventually fork/exec the MPI - * app. DDT does not current set MPIR_being_debugged in the MPI app. - * - ************************************************************************** - * - * We support two ways of waiting for attaching debuggers. The - * implementation spans this file and ompi/debuggers/ompi_debuggers.c. - * - * 1. If using orterun: MPI processes will have the - * orte_in_parallel_debugger MCA param set to true (because not all - * debuggers consistently set MPIR_being_debugged in both the launcher - * and in the MPI procs). The HNP will call MPIR_Breakpoint() and - * then RML send a message to VPID 0 (MCW rank 0) when it returns - * (MPIR_Breakpoint() doesn't return until the debugger has attached - * to all relevant processes). Meanwhile, VPID 0 blocks waiting for - * the RML message. 
All other VPIDs immediately call the grpcomm - * barrier (and therefore block until the debugger attaches). Once - * VPID 0 receives the RML message, we know that the debugger has - * attached to all processes that it cares about, and VPID 0 then - * joins the grpcomm barrier, allowing the job to continue. This - * scheme has the side effect of nicely supporting partial attaches by - * parallel debuggers (i.e., attaching to only some of the MPI - * processes; not necessarily all of them). - * - * 2. If not using orterun: in this case, we know that there will not be an RML message - * sent to VPID 0. So we have to look for a magic environment - * variable from the launcher to know if the jobs will be attached by - * a debugger (e.g., set by yod, srun, ...etc.), and if so, spin on - * MPIR_debug_gate. These environment variable names must be - * hard-coded in the OMPI layer (see ompi/debuggers/ompi_debuggers.c). - */ - -/* local globals and functions */ -#define DUMP_INT(X) fprintf(stderr, " %s = %d\n", # X, X); -#define FILE_MODE (S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH) - -struct MPIR_PROCDESC { - char *host_name; /* something that can be passed to inet_addr */ - char *executable_name; /* name of binary */ - int pid; /* process pid */ -}; - - -/** - * Initialization of data structures for running under a debugger - * using the MPICH/TotalView parallel debugger interface. Before the - * spawn we need to check if we are being run under a TotalView-like - * debugger; if so then inform applications via an MCA parameter. 
- */ -static void orte_debugger_init_before_spawn(orte_job_t *jdata) -{ - char *env_name; - orte_app_context_t *app; - int i; - char *attach_fifo; - - if (!MPIR_being_debugged && !orte_in_parallel_debugger) { - /* if we were given a test debugger, then we still want to - * colaunch it - */ - if (NULL != orte_debugger_test_daemon) { - opal_output_verbose(2, orte_debug_output, - "%s No debugger test daemon specified", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - goto launchit; - } - /* if we were given an auto-detect rate, then we want to setup - * an event so we periodically do the check - */ - if (0 < orte_debugger_check_rate) { - opal_output_verbose(2, orte_debug_output, - "%s Setting debugger attach check rate for %d seconds", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - orte_debugger_check_rate); - ORTE_TIMER_EVENT(orte_debugger_check_rate, 0, attach_debugger, ORTE_SYS_PRI); - } else if (orte_create_session_dirs) { - /* create the attachment FIFO and setup readevent - cannot be - * done if no session dirs exist! 
- */ - attach_fifo = opal_os_path(false, orte_process_info.job_session_dir, "debugger_attach_fifo", NULL); - if ((mkfifo(attach_fifo, FILE_MODE) < 0) && errno != EEXIST) { - opal_output(0, "CANNOT CREATE FIFO %s: errno %d", attach_fifo, errno); - free(attach_fifo); - return; - } - strncpy(MPIR_attach_fifo, attach_fifo, MPIR_MAX_PATH_LENGTH - 1); - free(attach_fifo); - open_fifo(); - } - return; - } - - launchit: - opal_output_verbose(1, orte_debug_output, "Info: Spawned by a debugger"); - - /* tell the procs they are being debugged */ - (void) mca_base_var_env_name ("orte_in_parallel_debugger", &env_name); - - for (i=0; i < jdata->apps->size; i++) { - if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) { - continue; - } - opal_setenv(env_name, "1", true, &app->env); - } - free(env_name); -} - -static bool mpir_breakpoint_fired = false; - -static void orte_debugger_dump(void) -{ - int i; - - DUMP_INT(MPIR_being_debugged); - DUMP_INT(MPIR_debug_state); - DUMP_INT(MPIR_partial_attach_ok); - DUMP_INT(MPIR_i_am_starter); - DUMP_INT(MPIR_forward_output); - DUMP_INT(MPIR_proctable_size); - fprintf(stderr, " MPIR_proctable:\n"); - for (i = 0; i < MPIR_proctable_size; i++) { - fprintf(stderr, - " (i, host, exe, pid) = (%d, %s, %s, %d)\n", - i, - MPIR_proctable[i].host_name, - MPIR_proctable[i].executable_name, - MPIR_proctable[i].pid); - } - fprintf(stderr, "MPIR_executable_path: %s\n", - ('\0' == MPIR_executable_path[0]) ? - "NULL" : (char*) MPIR_executable_path); - fprintf(stderr, "MPIR_server_arguments: %s\n", - ('\0' == MPIR_server_arguments[0]) ? 
- "NULL" : (char*) MPIR_server_arguments); -} - -static void setup_debugger_job(void) -{ - orte_job_t *debugger; - orte_app_context_t *app; - orte_proc_t *proc; - int i, rc; - orte_node_t *node; - orte_vpid_t vpid=0; - char cwd[OPAL_PATH_MAX]; - - /* setup debugger daemon job */ - debugger = OBJ_NEW(orte_job_t); - /* create a jobid for these daemons - this is done solely - * to avoid confusing the rest of the system's bookkeeping - */ - orte_plm_base_create_jobid(debugger); - /* set the personality to ORTE */ - opal_argv_append_nosize(&debugger->personality, "orte"); - /* flag the job as being debugger daemons */ - ORTE_FLAG_SET(debugger, ORTE_JOB_FLAG_DEBUGGER_DAEMON); - /* unless directed, we do not forward output */ - if (!MPIR_forward_output) { - ORTE_FLAG_SET(debugger, ORTE_JOB_FLAG_FORWARD_OUTPUT); - } - /* dont push stdin */ - debugger->stdin_target = ORTE_VPID_INVALID; - /* add it to the global job pool */ - opal_hash_table_set_value_uint32(orte_job_data, debugger->jobid, debugger); - /* create an app_context for the debugger daemon */ - app = OBJ_NEW(orte_app_context_t); - if (NULL != orte_debugger_test_daemon) { - app->app = strdup(orte_debugger_test_daemon); - } else { - app->app = strdup((char*)MPIR_executable_path); - } - /* don't currently have an option to pass the debugger - * cwd - probably should add one someday - */ - if (OPAL_SUCCESS != (rc = opal_getcwd(cwd, sizeof(cwd)))) { - orte_show_help("help-orterun.txt", "orterun:init-failure", - true, "get the cwd", rc); - return; - } - app->cwd = strdup(cwd); - orte_remove_attribute(&app->attributes, ORTE_APP_USER_CWD); - opal_argv_append_nosize(&app->argv, app->app); - build_debugger_args(app); - opal_pointer_array_add(debugger->apps, app); - debugger->num_apps = 1; - /* create a job map */ - debugger->map = OBJ_NEW(orte_job_map_t); - /* in building the map, we want to launch one debugger daemon - * on each node that *already has an application process on it*. 
- * We cannot just launch one debugger daemon on EVERY node because - * the original job may not have placed procs on every node. So - * we construct the map here by cycling across all nodes, adding - * only those nodes where num_procs > 0. - */ - for (i=0; i < orte_node_pool->size; i++) { - if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) { - continue; - } - /* if this node wasn't included in the vm, ignore it */ - if (NULL == node->daemon) { - continue; - } - /* if the node doesn't have any app procs on it, ignore it */ - if (node->num_procs < 1) { - continue; - } - /* this node has at least one proc, so add it to our map */ - OBJ_RETAIN(node); - opal_pointer_array_add(debugger->map->nodes, node); - debugger->map->num_nodes++; - /* add a debugger daemon to the node - note that the - * debugger daemon does NOT count against our subscribed slots - */ - proc = OBJ_NEW(orte_proc_t); - proc->name.jobid = debugger->jobid; - proc->name.vpid = vpid++; - /* set the local/node ranks - we don't actually care - * what these are, but the odls needs them - */ - proc->local_rank = 0; - proc->node_rank = 0; - proc->app_rank = proc->name.vpid; - /* flag the proc as ready for launch */ - proc->state = ORTE_PROC_STATE_INIT; - proc->app_idx = 0; - - OBJ_RETAIN(node); /* maintain accounting on object */ - proc->node = node; - /* add the proc to the job */ - opal_pointer_array_set_item(debugger->procs, proc->name.vpid, proc); - debugger->num_procs++; - - /* add the proc to the node's array */ - OBJ_RETAIN(proc); - opal_pointer_array_add(node->procs, (void*)proc); - node->num_procs++; - } - /* schedule it for launch */ - debugger->state = ORTE_JOB_STATE_INIT; - ORTE_ACTIVATE_JOB_STATE(debugger, ORTE_JOB_STATE_LAUNCH_APPS); -} - -/* - * Initialization of data structures for running under a debugger - * using the MPICH/TotalView parallel debugger interface. 
This stage - * of initialization must occur after spawn - * - * NOTE: We -always- perform this step to ensure that any debugger - * that attaches to us post-launch of the application can get a - * completed proctable - */ -void orte_debugger_init_after_spawn(int fd, short event, void *cbdata) -{ - orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; - orte_job_t *jdata = caddy->jdata; - orte_proc_t *proc; - orte_app_context_t *appctx; - orte_vpid_t i, j; - opal_buffer_t *buf; - int rc; - char **aliases, *aptr; - - /* if we couldn't get thru the mapper stage, we might - * enter here with no procs. Avoid the "zero byte malloc" - * message by checking here - */ - if (MPIR_proctable || 0 == jdata->num_procs) { - /* already initialized */ - opal_output_verbose(5, orte_debug_output, - "%s: debugger already initialized or zero procs", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - OBJ_RELEASE(caddy); - if (!mpir_breakpoint_fired) { - /* record that we have triggered the debugger */ - mpir_breakpoint_fired = true; - - /* trigger the debugger */ - MPIR_Breakpoint(); - - /* send a message to rank=0 to release it */ - if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, 0)) || - ORTE_PROC_STATE_UNTERMINATED < proc->state ) { - /* proc is already dead */ - return; - } - buf = OBJ_NEW(opal_buffer_t); /* don't need anything in this */ - if (0 > (rc = orte_rml.send_buffer_nb(&proc->name, buf, - ORTE_RML_TAG_DEBUGGER_RELEASE, - orte_rml_send_callback, NULL))) { - opal_output(0, "Error: could not send debugger release to MPI procs - error %s", ORTE_ERROR_NAME(rc)); - OBJ_RELEASE(buf); - } - } - return; - } - - /* fill in the proc table for the application processes */ - - opal_output_verbose(5, orte_debug_output, - "%s: Setting up debugger process table for applications", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - - MPIR_debug_state = 1; - - /* set the total number of processes in the job */ - MPIR_proctable_size = jdata->num_procs; - - /* allocate MPIR_proctable 
*/ - MPIR_proctable = (struct MPIR_PROCDESC *)malloc(sizeof(struct MPIR_PROCDESC) * - MPIR_proctable_size); - if (MPIR_proctable == NULL) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - OBJ_RELEASE(caddy); - return; - } - - if (orte_debugger_dump_proctable) { - opal_output(orte_clean_output, "MPIR Proctable for job %s", ORTE_JOBID_PRINT(jdata->jobid)); - } - - /* initialize MPIR_proctable */ - for (j=0; j < jdata->num_procs; j++) { - if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) { - continue; - } - /* store this data in the location whose index - * corresponds to the proc's rank - */ - i = proc->name.vpid; - if (NULL == (appctx = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, proc->app_idx))) { - continue; - } - - /* take the indicated alias as the hostname, if aliases exist */ - if (orte_retain_aliases) { - aliases = NULL; - aptr = NULL; - if (orte_get_attribute(&proc->node->attributes, ORTE_NODE_ALIAS, (void**)&aptr, OPAL_STRING)) { - aliases = opal_argv_split(aptr, ','); - free(aptr); - if (orte_use_hostname_alias <= opal_argv_count(aliases)) { - MPIR_proctable[i].host_name = strdup(aliases[orte_use_hostname_alias-1]); - } - opal_argv_free(aliases); - } - } else { - /* just use the default name */ - MPIR_proctable[i].host_name = strdup(proc->node->name); - } - - if ( 0 == strncmp(appctx->app, OPAL_PATH_SEP, 1 )) { - MPIR_proctable[i].executable_name = - opal_os_path( false, appctx->app, NULL ); - } else { - MPIR_proctable[i].executable_name = - opal_os_path( false, appctx->cwd, appctx->app, NULL ); - } - MPIR_proctable[i].pid = proc->pid; - if (orte_debugger_dump_proctable) { - opal_output(orte_clean_output, "%s: Host %s Exe %s Pid %d", - ORTE_VPID_PRINT(i), MPIR_proctable[i].host_name, - MPIR_proctable[i].executable_name, MPIR_proctable[i].pid); - } - } - - if (0 < opal_output_get_verbosity(orte_debug_output)) { - orte_debugger_dump(); - } - - /* if we are being launched under a debugger, then we must wait - * 
for it to be ready to go and do some things to start the job - */ - if (MPIR_being_debugged || NULL != orte_debugger_test_daemon || - NULL != getenv("ORTE_TEST_DEBUGGER_ATTACH")) { - /* if we are not launching debugger daemons, then trigger - * the debugger - otherwise, we need to wait for the debugger - * daemons to be started - */ - if ('\0' == MPIR_executable_path[0] && NULL == orte_debugger_test_daemon) { - /* record that we have triggered the debugger */ - mpir_breakpoint_fired = true; - - /* trigger the debugger */ - MPIR_Breakpoint(); - - /* send a message to rank=0 to release it */ - if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, 0)) || - ORTE_PROC_STATE_UNTERMINATED < proc->state) { - /* proc is already dead or never registered with us (so we don't have - * contact info for him) - */ - return; - } - opal_output_verbose(2, orte_debug_output, - "%s sending debugger release to %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc->name)); - buf = OBJ_NEW(opal_buffer_t); /* don't need anything in this */ - if (0 > (rc = orte_rml.send_buffer_nb(&proc->name, buf, - ORTE_RML_TAG_DEBUGGER_RELEASE, - orte_rml_send_callback, NULL))) { - opal_output(0, "Error: could not send debugger release to MPI procs - error %s", ORTE_ERROR_NAME(rc)); - OBJ_RELEASE(buf); - } - } else { - /* if I am launching debugger daemons, then I need to do so now - * that the job has been started and I know which nodes have - * apps on them - */ - opal_output_verbose(2, orte_debug_output, - "%s Cospawning debugger daemons %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (NULL == orte_debugger_test_daemon) ? 
- MPIR_executable_path : orte_debugger_test_daemon); - setup_debugger_job(); - } - /* we don't have anything else to do */ - OBJ_RELEASE(caddy); - return; - } - - /* if we are not being debugged, then just cleanup and depart */ - OBJ_RELEASE(caddy); -} - -/* - * Process one line from the orte_base_user_debugger MCA param and - * look for that debugger in the path. If we find it, fill in - * new_argv. - */ -static int process(char *orig_line, char *basename, opal_cmd_line_t *cmd_line, - int argc, char **argv, char ***new_argv, int num_procs) -{ - int ret = ORTE_SUCCESS; - int i, j, count; - char *line = NULL, *tmp = NULL, *full_line = strdup(orig_line); - char **orterun_argv = NULL, **executable_argv = NULL, **line_argv = NULL; - char cwd[OPAL_PATH_MAX]; - bool used_num_procs = false; - bool single_app = false; - bool fail_needed_executable = false; - - line = full_line; - if (NULL == line) { - ret = ORTE_ERR_OUT_OF_RESOURCE; - goto out; - } - - /* Trim off whitespace at the beginning and ending of line */ - - for (i = 0; '\0' != line[i] && isspace(line[i]); ++line) { - continue; - } - for (i = strlen(line) - 2; i > 0 && isspace(line[i]); ++i) { - line[i] = '\0'; - } - if (strlen(line) <= 0) { - ret = ORTE_ERROR; - goto out; - } - - /* Get the tail of the command line (i.e., the user executable / - argv) */ - - opal_cmd_line_get_tail(cmd_line, &i, &executable_argv); - - /* Make a new copy of the orterun command line args, without the - orterun token itself, and without the --debug, --debugger, and - -tv flags. 
*/ - - orterun_argv = opal_argv_copy(argv); - count = opal_argv_count(orterun_argv); - opal_argv_delete(&count, &orterun_argv, 0, 1); - for (i = 0; NULL != orterun_argv[i]; ++i) { - count = opal_argv_count(orterun_argv); - if (0 == strcmp(orterun_argv[i], "-debug") || - 0 == strcmp(orterun_argv[i], "--debug")) { - opal_argv_delete(&count, &orterun_argv, i, 1); - } else if (0 == strcmp(orterun_argv[i], "-tv") || - 0 == strcmp(orterun_argv[i], "--tv")) { - opal_argv_delete(&count, &orterun_argv, i, 1); - } else if (0 == strcmp(orterun_argv[i], "--debugger") || - 0 == strcmp(orterun_argv[i], "-debugger")) { - opal_argv_delete(&count, &orterun_argv, i, 2); - } - } - - /* Replace @@ tokens - line should never realistically be bigger - than MAX_INT, so just cast to int to remove compiler warning */ - - *new_argv = NULL; - line_argv = opal_argv_split(line, ' '); - if (NULL == line_argv) { - ret = ORTE_ERR_NOT_FOUND; - goto out; - } - for (i = 0; NULL != line_argv[i]; ++i) { - if (0 == strcmp(line_argv[i], "@mpirun@") || - 0 == strcmp(line_argv[i], "@orterun@")) { - opal_argv_append_nosize(new_argv, argv[0]); - } else if (0 == strcmp(line_argv[i], "@mpirun_args@") || - 0 == strcmp(line_argv[i], "@orterun_args@")) { - for (j = 0; NULL != orterun_argv && NULL != orterun_argv[j]; ++j) { - opal_argv_append_nosize(new_argv, orterun_argv[j]); - } - } else if (0 == strcmp(line_argv[i], "@np@")) { - used_num_procs = true; - asprintf(&tmp, "%d", num_procs); - opal_argv_append_nosize(new_argv, tmp); - free(tmp); - } else if (0 == strcmp(line_argv[i], "@single_app@")) { - /* This token is only a flag; it is not replaced with any - alternate text */ - single_app = true; - } else if (0 == strcmp(line_argv[i], "@executable@")) { - /* If we found the executable, paste it in. Otherwise, - this is a possible error. 
*/ - if (NULL != executable_argv) { - opal_argv_append_nosize(new_argv, executable_argv[0]); - } else { - fail_needed_executable = true; - } - } else if (0 == strcmp(line_argv[i], "@executable_argv@")) { - /* If we found the tail, paste in the argv. Otherwise, - this is a possible error. */ - if (NULL != executable_argv) { - for (j = 1; NULL != executable_argv[j]; ++j) { - opal_argv_append_nosize(new_argv, executable_argv[j]); - } - } else { - fail_needed_executable = true; - } - } else { - /* It wasn't a special token, so just copy it over */ - opal_argv_append_nosize(new_argv, line_argv[i]); - } - } - - /* Can we find argv[0] in the path? */ - - getcwd(cwd, OPAL_PATH_MAX); - tmp = opal_path_findv((*new_argv)[0], X_OK, environ, cwd); - if (NULL != tmp) { - free(tmp); - - /* Ok, we found a good debugger. Check for some error - conditions. */ - tmp = opal_argv_join(argv, ' '); - - /* We do not support launching a debugger that requires the - -np value if the user did not specify -np on the command - line. */ - if (used_num_procs && 0 == num_procs) { - free(tmp); - tmp = opal_argv_join(orterun_argv, ' '); - orte_show_help("help-orterun.txt", "debugger requires -np", - true, (*new_argv)[0], argv[0], tmp, - (*new_argv)[0]); - /* Fall through to free / fail, below */ - } - - /* Some debuggers do not support launching MPMD */ - else if (single_app && NULL != strstr(tmp, " : ")) { - orte_show_help("help-orterun.txt", - "debugger only accepts single app", true, - (*new_argv)[0], (*new_argv)[0]); - /* Fall through to free / fail, below */ - } - - /* Some debuggers do not use orterun/mpirun, and therefore - must have an executable to run (e.g., cannot use mpirun's - app context file feature). */ - else if (fail_needed_executable) { - orte_show_help("help-orterun.txt", - "debugger requires executable", true, - (*new_argv)[0], argv[0], (*new_argv)[0], argv[0], - (*new_argv)[0]); - /* Fall through to free / fail, below */ - } - - /* Otherwise, we succeeded. Return happiness. 
*/ - else { - goto out; - } - } - - /* All done -- didn't find it */ - - opal_argv_free(*new_argv); - *new_argv = NULL; - ret = ORTE_ERR_NOT_FOUND; - - out: - if (NULL != orterun_argv) { - opal_argv_free(orterun_argv); - } - if (NULL != executable_argv) { - opal_argv_free(executable_argv); - } - if (NULL != line_argv) { - opal_argv_free(line_argv); - } - if (NULL != tmp) { - free(tmp); - } - if (NULL != full_line) { - free(full_line); - } - return ret; -} - -static void open_fifo (void) -{ - if (orte_debugger_attach_fd > 0) { - close(orte_debugger_attach_fd); - } - - orte_debugger_attach_fd = open(MPIR_attach_fifo, O_RDONLY | O_NONBLOCK, 0); - if (orte_debugger_attach_fd < 0) { - opal_output(0, "%s unable to open debugger attach fifo", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - return; - } - - /* Set this fd to be close-on-exec so that children don't see it */ - if (opal_fd_set_cloexec(orte_debugger_attach_fd) != OPAL_SUCCESS) { - opal_output(0, "%s unable to set debugger attach fifo to CLOEXEC", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - close(orte_debugger_attach_fd); - orte_debugger_attach_fd = -1; - return; - } - - opal_output_verbose(2, orte_debug_output, - "%s Monitoring debugger attach fifo %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - MPIR_attach_fifo); - orte_debugger_attach = (opal_event_t*)malloc(sizeof(opal_event_t)); - opal_event_set(orte_event_base, orte_debugger_attach, orte_debugger_attach_fd, - OPAL_EV_READ, attach_debugger, orte_debugger_attach); - - orte_debugger_fifo_active = true; - opal_event_add(orte_debugger_attach, 0); -} - -static void attach_debugger(int fd, short event, void *arg) -{ - unsigned char fifo_cmd; - int rc; - orte_timer_t *tm; - - if (orte_debugger_fifo_active) { - orte_debugger_attach = (opal_event_t*)arg; - orte_debugger_fifo_active = false; - - rc = read(orte_debugger_attach_fd, &fifo_cmd, sizeof(fifo_cmd)); - if (!rc) { - /* release the current event */ - opal_event_free(orte_debugger_attach); - /* reopen device to clear 
hangup */ - open_fifo(); - return; - } - if (1 != fifo_cmd) { - /* ignore the cmd */ - orte_debugger_fifo_active = true; - opal_event_add(orte_debugger_attach, 0); - return; - } - } - - if (!MPIR_being_debugged && !orte_debugger_test_attach) { - /* false alarm - reset the read or timer event */ - if (0 == orte_debugger_check_rate) { - orte_debugger_fifo_active = true; - opal_event_add(orte_debugger_attach, 0); - } else if (!MPIR_being_debugged) { - tm = (orte_timer_t*)arg; - /* re-add the event */ - opal_event_evtimer_add(tm->ev, &tm->tv); - } - return; - } - - opal_output_verbose(1, orte_debug_output, - "%s Attaching debugger %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (NULL == orte_debugger_test_daemon) ? MPIR_executable_path : orte_debugger_test_daemon); - - /* a debugger has attached! All the MPIR_Proctable - * data is already available, so we only need to - * check to see if we should spawn any daemons - */ - if ('\0' != MPIR_executable_path[0] || NULL != orte_debugger_test_daemon) { - opal_output_verbose(2, orte_debug_output, - "%s Spawning debugger daemons %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - (NULL == orte_debugger_test_daemon) ? 
- MPIR_executable_path : orte_debugger_test_daemon); - setup_debugger_job(); - } - - /* reset the read or timer event */ - if (0 == orte_debugger_check_rate) { - orte_debugger_fifo_active = true; - opal_event_add(orte_debugger_attach, 0); - } else if (!MPIR_being_debugged) { - tm = (orte_timer_t*)arg; - /* re-add the event */ - opal_event_evtimer_add(tm->ev, &tm->tv); - } -} - -static void build_debugger_args(orte_app_context_t *debugger) -{ - int i, j; - char mpir_arg[MPIR_MAX_ARG_LENGTH]; - - if ('\0' != MPIR_server_arguments[0]) { - j=0; - memset(mpir_arg, 0, MPIR_MAX_ARG_LENGTH); - for (i=0; i < MPIR_MAX_ARG_LENGTH; i++) { - if (MPIR_server_arguments[i] == '\0') { - if (0 < j) { - opal_argv_append_nosize(&debugger->argv, mpir_arg); - memset(mpir_arg, 0, MPIR_MAX_ARG_LENGTH); - j=0; - } - } else { - mpir_arg[j] = MPIR_server_arguments[i]; - j++; - } - } - } -} - -/** - * Run a user-level debugger - */ -static void run_debugger(char *basename, opal_cmd_line_t *cmd_line, - int argc, char *argv[], int num_procs) -{ - int i, id, ret; - char **new_argv = NULL; - const char **tmp; - char *value, **lines, *env_name; - - /* Get the orte_base_debug MCA parameter and search for a debugger - that can run */ - - id = mca_base_var_find("orte", "orte", NULL, "base_user_debugger"); - if (id < 0) { - orte_show_help("help-orterun.txt", "debugger-mca-param-not-found", - true); - exit(1); - } - - ret = mca_base_var_get_value (id, &tmp, NULL, NULL); - if (OPAL_SUCCESS != ret || NULL == tmp || NULL == tmp[0]) { - orte_show_help("help-orterun.txt", "debugger-orte_base_user_debugger-empty", - true); - exit(1); - } - - /* Look through all the values in the MCA param */ - - lines = opal_argv_split(tmp[0], ':'); - for (i = 0; NULL != lines[i]; ++i) { - if (ORTE_SUCCESS == process(lines[i], basename, cmd_line, argc, argv, - &new_argv, num_procs)) { - break; - } - } - - /* If we didn't find one, abort */ - - if (NULL == lines[i]) { - orte_show_help("help-orterun.txt", "debugger-not-found", 
true); - exit(1); - } - opal_argv_free(lines); - - /* We found one */ - - /* cleanup the MPIR arrays in case the debugger doesn't set them */ - memset((char*)MPIR_executable_path, 0, MPIR_MAX_PATH_LENGTH); - memset((char*)MPIR_server_arguments, 0, MPIR_MAX_ARG_LENGTH); - - /* Set an MCA param so that everyone knows that they are being - launched under a debugger; not all debuggers are consistent - about setting MPIR_being_debugged in both the launcher and the - MPI processes */ - ret = mca_base_var_env_name ("orte_in_parallel_debugger", &env_name); - if (OPAL_SUCCESS == ret && NULL != env_name) { - opal_setenv(env_name, "1", true, &environ); - free(env_name); - } - - /* Launch the debugger */ - execvp(new_argv[0], new_argv); - value = opal_argv_join(new_argv, ' '); - orte_show_help("help-orterun.txt", "debugger-exec-failed", - true, basename, value, new_argv[0]); - free(value); - opal_argv_free(new_argv); - exit(1); -} - -void orte_debugger_detached(int fd, short event, void *cbdata) -{ - orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; - OBJ_RELEASE(caddy); - - /* need to ensure MPIR_Breakpoint is called again if another debugger attaches */ - mpir_breakpoint_fired = false; -} - -void orte_timeout_wakeup(int sd, short args, void *cbdata) -{ - char *tm; - - /* this function gets called when the job execution time - * has hit a prescribed limit - so just abort - */ - tm = getenv("MPIEXEC_TIMEOUT"); - orte_show_help("help-orterun.txt", "orterun:timeout", - true, (NULL == tm) ? 
"NULL" : tm); - ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); - /* if we are testing HNP suicide, then just exit */ - if (ORTE_PROC_IS_HNP && - NULL != getenv("ORTE_TEST_HNP_SUICIDE")) { - opal_output(0, "HNP exiting w/o cleanup"); - exit(1); - } - /* abort the job */ - ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALL_JOBS_COMPLETE); - /* set the global abnormal exit flag */ - orte_abnormal_term_ordered = true; -} diff --git a/orte/orted/orted_submit.h b/orte/orted/orted_submit.h index ed6a8313d43..23eead97941 100644 --- a/orte/orted/orted_submit.h +++ b/orte/orted/orted_submit.h @@ -28,13 +28,6 @@ ORTE_DECLSPEC int orte_submit_job(char *cmd[], int *index, orte_submit_cbfunc_t launch_cb, void *launch_cbdata, orte_submit_cbfunc_t complete_cb, void *complete_cbdata); ORTE_DECLSPEC int orte_submit_halt(void); -ORTE_DECLSPEC void orte_debugger_init_after_spawn(int fd, short event, void *arg); -ORTE_DECLSPEC void orte_debugger_detached(int fd, short event, void *arg); - -extern int orte_debugger_attach_fd; -extern bool orte_debugger_fifo_active; -extern opal_event_t *orte_debugger_attach; -extern char MPIR_attach_fifo[]; /** * Global struct for catching orte command line options. 
@@ -89,7 +82,6 @@ struct orte_cmd_line_t { bool merge; bool enable_recovery; char *hnp; - bool staged_exec; }; typedef struct orte_cmd_line_t orte_cmd_line_t; ORTE_DECLSPEC extern orte_cmd_line_t orte_cmd_line; diff --git a/orte/orted/pmix/pmix_server_dyn.c b/orte/orted/pmix/pmix_server_dyn.c index 9c800410ff1..907208329d1 100644 --- a/orte/orted/pmix/pmix_server_dyn.c +++ b/orte/orted/pmix/pmix_server_dyn.c @@ -169,17 +169,25 @@ int pmix_server_spawn_fn(opal_process_name_t *requestor, /* create the job object */ jdata = OBJ_NEW(orte_job_t); - jdata->map = OBJ_NEW(orte_job_map_t); /* transfer the job info across */ OPAL_LIST_FOREACH(info, job_info, opal_value_t) { if (0 == strcmp(info->key, OPAL_PMIX_PERSONALITY)) { jdata->personality = opal_argv_split(info->data.string, ','); } else if (0 == strcmp(info->key, OPAL_PMIX_MAPPER)) { + if (NULL == jdata->map) { + jdata->map = OBJ_NEW(orte_job_map_t); + } jdata->map->req_mapper = strdup(info->data.string); } else if (0 == strcmp(info->key, OPAL_PMIX_DISPLAY_MAP)) { + if (NULL == jdata->map) { + jdata->map = OBJ_NEW(orte_job_map_t); + } jdata->map->display_map = true; } else if (0 == strcmp(info->key, OPAL_PMIX_PPR)) { + if (NULL == jdata->map) { + jdata->map = OBJ_NEW(orte_job_map_t); + } if (ORTE_MAPPING_POLICY_IS_SET(jdata->map->mapping)) { /* not allowed to provide multiple mapping policies */ orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", @@ -190,6 +198,9 @@ int pmix_server_spawn_fn(opal_process_name_t *requestor, ORTE_SET_MAPPING_DIRECTIVE(jdata->map->mapping, ORTE_MAPPING_PPR); jdata->map->ppr = strdup(info->data.string); } else if (0 == strcmp(info->key, OPAL_PMIX_MAPBY)) { + if (NULL == jdata->map) { + jdata->map = OBJ_NEW(orte_job_map_t); + } if (ORTE_MAPPING_POLICY_IS_SET(jdata->map->mapping)) { /* not allowed to provide multiple mapping policies */ orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", @@ -203,6 +214,9 @@ int pmix_server_spawn_fn(opal_process_name_t *requestor, 
return rc; } } else if (0 == strcmp(info->key, OPAL_PMIX_RANKBY)) { + if (NULL == jdata->map) { + jdata->map = OBJ_NEW(orte_job_map_t); + } if (ORTE_RANKING_POLICY_IS_SET(jdata->map->ranking)) { /* not allowed to provide multiple ranking policies */ orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", @@ -217,6 +231,9 @@ int pmix_server_spawn_fn(opal_process_name_t *requestor, return rc; } } else if (0 == strcmp(info->key, OPAL_PMIX_BINDTO)) { + if (NULL == jdata->map) { + jdata->map = OBJ_NEW(orte_job_map_t); + } if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) { /* not allowed to provide multiple mapping policies */ orte_show_help("help-opal-hwloc-base.txt", "redefining-policy", true, diff --git a/orte/tools/Makefile.am b/orte/tools/Makefile.am index 51f60c290c2..2651e373acb 100644 --- a/orte/tools/Makefile.am +++ b/orte/tools/Makefile.am @@ -13,7 +13,7 @@ # Copyright (c) 2006-2008 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2011-2013 Los Alamos National Security, LLC. All rights # reserved. -# Copyright (c) 2014-2016 Intel, Inc. All rights reserved. +# Copyright (c) 2014-2015 Intel, Inc. All rights reserved. 
# $COPYRIGHT$ # # Additional copyrights may follow @@ -36,6 +36,7 @@ SUBDIRS += \ tools/orte-info \ tools/orte-migrate \ tools/orte-server \ + tools/orte-submit \ tools/orte-dvm DIST_SUBDIRS += \ @@ -50,5 +51,6 @@ DIST_SUBDIRS += \ tools/orte-info \ tools/orte-migrate \ tools/orte-server \ + tools/orte-submit \ tools/orte-dvm diff --git a/orte/tools/orte-checkpoint/orte-checkpoint.c b/orte/tools/orte-checkpoint/orte-checkpoint.c index b732f076348..b51d8f538ce 100644 --- a/orte/tools/orte-checkpoint/orte-checkpoint.c +++ b/orte/tools/orte-checkpoint/orte-checkpoint.c @@ -417,7 +417,7 @@ static int parse_args(int argc, char *argv[]) { /** * Put all of the MCA arguments in the environment */ - mca_base_cmd_line_process_args(argc, &app_env, &global_env); + mca_base_cmd_line_process_args(&cmd_line, &app_env, &global_env); len = opal_argv_count(app_env); for(i = 0; i < len; ++i) { diff --git a/orte/tools/orte-dvm/orte-dvm.c b/orte/tools/orte-dvm/orte-dvm.c index 2a05a685c86..479aed125b4 100644 --- a/orte/tools/orte-dvm/orte-dvm.c +++ b/orte/tools/orte-dvm/orte-dvm.c @@ -234,7 +234,7 @@ int main(int argc, char *argv[]) * opal_init_util() since mca_base_cmd_line_process_args() does *not* * depend upon opal_init_util() functionality. */ - if (OPAL_SUCCESS != mca_base_cmd_line_process_args(argv, &environ, &environ)) { + if (OPAL_SUCCESS != mca_base_cmd_line_process_args(&cmd_line, &environ, &environ)) { exit(1); } diff --git a/orte/tools/orte-info/orte-info.c b/orte/tools/orte-info/orte-info.c index 8ff27811eee..5b2230fbb52 100644 --- a/orte/tools/orte-info/orte-info.c +++ b/orte/tools/orte-info/orte-info.c @@ -184,7 +184,7 @@ int main(int argc, char *argv[]) exit(cmd_error ? 1 : 0); } - mca_base_cmd_line_process_args(argv, &app_env, &global_env); + mca_base_cmd_line_process_args(orte_info_cmd_line, &app_env, &global_env); /* putenv() all the stuff that we got back from env (in case the * user specified some --mca params on the command line). 
This diff --git a/orte/tools/orte-migrate/orte-migrate.c b/orte/tools/orte-migrate/orte-migrate.c index 1a8bb11d0a4..6b7f9827ad7 100644 --- a/orte/tools/orte-migrate/orte-migrate.c +++ b/orte/tools/orte-migrate/orte-migrate.c @@ -304,7 +304,7 @@ static int parse_args(int argc, char *argv[]) { /** * Put all of the MCA arguments in the environment */ - mca_base_cmd_line_process_args(argv, &app_env, &global_env); + mca_base_cmd_line_process_args(&cmd_line, &app_env, &global_env); len = opal_argv_count(app_env); for(i = 0; i < len; ++i) { diff --git a/orte/tools/orte-restart/orte-restart.c b/orte/tools/orte-restart/orte-restart.c index 5b94a10093e..9a7974e8e64 100644 --- a/orte/tools/orte-restart/orte-restart.c +++ b/orte/tools/orte-restart/orte-restart.c @@ -461,7 +461,7 @@ static int parse_args(int argc, char *argv[]) /** * Put all of the MCA arguments in the environment */ - mca_base_cmd_line_process_args(argv, &app_env, &global_env); + mca_base_cmd_line_process_args(&cmd_line, &app_env, &global_env); len = opal_argv_count(app_env); for(i = 0; i < len; ++i) { diff --git a/orte/tools/orte-server/orte-server.c b/orte/tools/orte-server/orte-server.c index 89ac610970f..46ef0b90465 100644 --- a/orte/tools/orte-server/orte-server.c +++ b/orte/tools/orte-server/orte-server.c @@ -149,7 +149,7 @@ int main(int argc, char *argv[]) * Since this process can now handle MCA/GMCA parameters, make sure to * process them. */ - mca_base_cmd_line_process_args(argv, &environ, &environ); + mca_base_cmd_line_process_args(cmd_line, &environ, &environ); /* if debug is set, then set orte_debug_flag so that the data server * code will output diff --git a/orte/tools/orte-submit/Makefile.am b/orte/tools/orte-submit/Makefile.am new file mode 100644 index 00000000000..93d7e1068e1 --- /dev/null +++ b/orte/tools/orte-submit/Makefile.am @@ -0,0 +1,57 @@ +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. 
All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2008-2014 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright (c) 2015 Intel, Inc. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# This is not quite in the Automake spirit, but we have to do it. +# Since the totalview portion of the library must be built with -g, we +# must eliminate the CFLAGS that are passed in here by default (which +# may already have debugging and/or optimization flags). We use +# post-processed forms of the CFLAGS in the library targets down +# below. + +CFLAGS = $(CFLAGS_WITHOUT_OPTFLAGS) $(DEBUGGER_CFLAGS) + +include $(top_srcdir)/Makefile.ompi-rules + +man_pages = orte-submit.1 +EXTRA_DIST = $(man_pages:.1=.1in) + +if OPAL_INSTALL_BINARIES + +bin_PROGRAMS = orte-submit + +nodist_man_MANS = $(man_pages) + +# Ensure that the man pages are rebuilt if the opal_config.h file +# changes; a "good enough" way to know if configure was run again (and +# therefore the release date or version may have changed) +$(nodist_man_MANS): $(top_builddir)/opal/include/opal_config.h + +endif # OPAL_INSTALL_BINARIES + +orte_submit_SOURCES = \ + orte-submit.c + +orte_submit_LDADD = \ + $(top_builddir)/orte/lib@ORTE_LIB_PREFIX@open-rte.la \ + $(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la + +distclean-local: + rm -f $(man_pages) diff --git a/orte/tools/orte-submit/orte-submit.1in b/orte/tools/orte-submit/orte-submit.1in new file mode 100644 index 00000000000..d37c48188b8 --- /dev/null +++ b/orte/tools/orte-submit/orte-submit.1in @@ -0,0 +1,1428 @@ +.\" -*- 
nroff -*- +.\" Copyright (c) 2009-2014 Cisco Systems, Inc. All rights reserved. +.\" Copyright (c) 2008-2009 Sun Microsystems, Inc. All rights reserved. +.\" Copyright (c) 2015 Intel, Inc. All rights reserved. +.\" $COPYRIGHT$ +.\" +.\" Man page for ORTE's orte-submit command +.\" +.\" .TH name section center-footer left-footer center-header +.TH ORTE-SUBMIT 1 "#OMPI_DATE#" "#PACKAGE_VERSION#" "#PACKAGE_NAME#" +.\" ************************** +.\" Name Section +.\" ************************** +.SH NAME +. +orte-submit, ompi-submit \- Execute serial and parallel jobs in Open MPI using a DVM. + +.B Note: +\fIompi-submit\fP and \fIorte-submit\fP are synonyms for each +other. Using either of the names will produce the same behavior. +. +.\" ************************** +.\" Synopsis Section +.\" ************************** +.SH SYNOPSIS +. +.PP +Single Process Multiple Data (SPMD) Model: + +.B ompi-submit +[ options ] +.B +[ ] +.P + +Multiple Instruction Multiple Data (MIMD) Model: + +.B ompi-submit +[ global_options ] + [ local_options1 ] +.B +[ ] : + [ local_options2 ] +.B +[ ] : + ... : + [ local_optionsN ] +.B +[ ] +.P + +Note that in both models, invoking \fIompi-submit\fP via an absolute path +name is equivalent to specifying the \fI--prefix\fP option with a +\fI\fR value equivalent to the directory where \fIompi-submit\fR +resides, minus its last subdirectory. For example: + + \fB%\fP /usr/local/bin/ompi-submit ... + +is equivalent to + + \fB%\fP ompi-submit --prefix /usr/local + +. +.\" ************************** +.\" Quick Summary Section +.\" ************************** +.SH QUICK SUMMARY +. +.B +Use of \fIorte-submit\fP requires that you first start the Distributed Virtual +Machine (DVM) using \fIorte-dvm\fP. 
+.P +If you are simply looking for how to run an MPI application, you +probably want to use a command line of the following form: + + \fB%\fP ompi-submit [ -np X ] [ --hostfile ] + +This will run X copies of \fI\fR in your current run-time +environment (if running under a supported resource manager, Open MPI's +\fIompi-submit\fR will usually automatically use the corresponding resource manager +process starter, as opposed to, for example, \fIrsh\fR or \fIssh\fR, +which require the use of a hostfile, or will default to running all X +copies on the localhost), scheduling (by default) in a round-robin fashion by +CPU slot. See the rest of this page for more details. +.P +Please note that ompi-submit automatically binds processes as of the start of the +v1.8 series. Two binding patterns are used in the absence of any further directives: +.TP 18 +.B Bind to core: +when the number of processes is <= 2 +. +. +.TP +.B Bind to socket: +when the number of processes is > 2 +. +. +.P +If your application uses threads, then you probably want to ensure that you are +either not bound at all (by specifying --bind-to none), or bound to multiple cores +using an appropriate binding level or specific number of processing elements per +application process. +. +.\" ************************** +.\" Options Section +.\" ************************** +.SH OPTIONS +. +.I ompi-submit +will send the name of the directory where it was invoked on the local +node to each of the remote nodes, and attempt to change to that +directory. See the "Current Working Directory" section below for further +details. +.\" +.\" Start options listing +.\" Indent 10 characters from start of first column to start of second column +.TP 10 +.B +The program executable. This is identified as the first non-recognized argument +to ompi-submit. +. +. +.TP +.B +Pass these run-time arguments to every new process. These must always +be the last arguments to \fIompi-submit\fP. 
If an app context file is used, +\fI\fP will be ignored. +. +. +.TP +.B -h\fR,\fP --help +Display help for this command +. +. +.TP +.B -q\fR,\fP --quiet +Suppress informative messages from orte-submit during application execution. +. +. +.TP +.B -v\fR,\fP --verbose +Be verbose +. +. +.TP +.B -V\fR,\fP --version +Print version number. If no other arguments are given, this will also +cause orte-submit to exit. +. +. +. +. +.P +Use one of the following options to specify which hosts (nodes) of the DVM to run on. +Specifying hosts outside the DVM will result in an error. +. +. +.TP +.B -H\fR,\fP -host\fR,\fP --host \fR\fP +List of hosts on which to invoke processes. +. +. +.TP +.B +-hostfile\fR,\fP --hostfile \fR\fP +Provide a hostfile to use. +.\" JJH - Should have man page for how to format a hostfile properly. +. +. +.TP +.B -machinefile\fR,\fP --machinefile \fR\fP +Synonym for \fI-hostfile\fP. +. +. +. +. +.P +The following options specify the number of processes to launch. Note that none +of the options imply a particular binding policy - e.g., requesting N processes +for each socket does not imply that the processes will be bound to the socket. +. +. +.TP +.B -c\fR,\fP -n\fR,\fP --n\fR,\fP -np \fR<#>\fP +Run this many copies of the program on the given nodes. This option +indicates that the specified file is an executable program and not an +application context. If no value is provided for the number of copies to +execute (i.e., neither the "-np" nor its synonyms are provided on the command +line), Open MPI will automatically execute a copy of the program on +each process slot (see below for description of a "process slot"). This +feature, however, can only be used in the SPMD model and will return an +error (without beginning execution of the application) otherwise. +. +. +.TP +.B --map-by ppr:N: +Launch N times the number of objects of the specified type on each node. +. +. 
+.TP +.B -npersocket\fR,\fP --npersocket <#persocket> +On each node, launch this many processes times the number of processor +sockets on the node. +The \fI-npersocket\fP option also turns on the \fI-bind-to-socket\fP option. +(deprecated in favor of --map-by ppr:n:socket) +. +. +.TP +.B -npernode\fR,\fP --npernode <#pernode> +On each node, launch this many processes. +(deprecated in favor of --map-by ppr:n:node) +. +. +.TP +.B -pernode\fR,\fP --pernode +On each node, launch one process -- equivalent to \fI-npernode\fP 1. +(deprecated in favor of --map-by ppr:1:node) +. +. +. +. +.P +To map processes: +. +. +.TP +.B --map-by +Map to the specified object, defaults to \fIsocket\fP. Supported options +include slot, hwthread, core, L1cache, L2cache, L3cache, socket, numa, +board, node, sequential, distance, and ppr. Any object can include +modifiers by adding a \fR:\fP and any combination of PE=n (bind n +processing elements to each proc), SPAN (load +balance the processes across the allocation), OVERSUBSCRIBE (allow +more processes on a node than processing elements), and NOOVERSUBSCRIBE. +This includes PPR, where the pattern would be terminated by another colon +to separate it from the modifiers. +. +.TP +.B -bycore\fR,\fP --bycore +Map processes by core (deprecated in favor of --map-by core) +. +.TP +.B -bysocket\fR,\fP --bysocket +Map processes by socket (deprecated in favor of --map-by socket) +. +.TP +.B -nolocal\fR,\fP --nolocal +Do not run any copies of the launched application on the same node as +orte-submit is running. This option will override listing the localhost +with \fB--host\fR or any other host-specifying mechanism. +. +.TP +.B -nooversubscribe\fR,\fP --nooversubscribe +Do not oversubscribe any nodes; error (without starting any processes) +if the requested number of processes would cause oversubscription. +This option implicitly sets "max_slots" equal to the "slots" value for +each node. +. 
+.TP +.B -bynode\fR,\fP --bynode +Launch processes one per node, cycling by node in a round-robin +fashion. This spreads processes evenly among nodes and assigns +MPI_COMM_WORLD ranks in a round-robin, "by node" manner. +. +. +. +. +.P +To order processes' ranks in MPI_COMM_WORLD: +. +. +.TP +.B --rank-by +Rank in round-robin fashion according to the specified object, +defaults to \fIslot\fP. Supported options +include slot, hwthread, core, L1cache, L2cache, L3cache, +socket, numa, board, and node. +. +. +. +. +.P +For process binding: +. +.TP +.B --bind-to +Bind processes to the specified object, defaults to \fIcore\fP. Supported options +include slot, hwthread, core, l1cache, l2cache, l3cache, socket, numa, board, and none. +. +.TP +.B -cpus-per-proc\fR,\fP --cpus-per-proc <#perproc> +Bind each process to the specified number of cpus. +(deprecated in favor of --map-by :PE=n) +. +.TP +.B -cpus-per-rank\fR,\fP --cpus-per-rank <#perrank> +Alias for \fI-cpus-per-proc\fP. +(deprecated in favor of --map-by :PE=n) +. +.TP +.B -bind-to-core\fR,\fP --bind-to-core +Bind processes to cores (deprecated in favor of --bind-to core) +. +.TP +.B -bind-to-socket\fR,\fP --bind-to-socket +Bind processes to processor sockets (deprecated in favor of --bind-to socket) +. +.TP +.B -bind-to-none\fR,\fP --bind-to-none +Do not bind processes (deprecated in favor of --bind-to none) +. +.TP +.B -report-bindings\fR,\fP --report-bindings +Report any bindings for launched processes. +. +.TP +.B -slot-list\fR,\fP --slot-list +List of processor IDs to be used for binding MPI processes. The specified bindings will +be applied to all MPI processes. See explanation below for syntax. +. +. +. +. +.P +For rankfiles: +. +. +.TP +.B -rf\fR,\fP --rankfile +Provide a rankfile file. +. +. +. +. +.P +To manage standard I/O: +. +. +.TP +.B -output-filename\fR,\fP --output-filename \fR\fP +Redirect the stdout, stderr, and stddiag of all processes to a process-unique version of +the specified filename. 
Any directories in the filename will automatically be created. +Each output file will consist of filename.id, where the id will be the +processes' rank in MPI_COMM_WORLD, left-filled with +zero's for correct ordering in listings. +. +. +.TP +.B -stdin\fR,\fP --stdin +The MPI_COMM_WORLD rank of the process that is to receive stdin. The +default is to forward stdin to MPI_COMM_WORLD rank 0, but this option +can be used to forward stdin to any process. It is also acceptable to +specify \fInone\fP, indicating that no processes are to receive stdin. +. +. +.TP +.B -tag-output\fR,\fP --tag-output +Tag each line of output to stdout, stderr, and stddiag with \fB[jobid, MCW_rank]\fP indicating the process jobid +and MPI_COMM_WORLD rank of the process that generated the output, and the channel which generated it. +. +. +.TP +.B -timestamp-output\fR,\fP --timestamp-output +Timestamp each line of output to stdout, stderr, and stddiag. +. +. +.TP +.B -xml\fR,\fP --xml +Provide all output to stdout, stderr, and stddiag in an xml format. +. +. +.TP +.B -xterm\fR,\fP --xterm \fR\fP +Display the output from the processes identified by their +MPI_COMM_WORLD ranks in separate xterm windows. The ranks are specified +as a comma-separated list of ranges, with a -1 indicating all. A separate +window will be created for each specified process. +.B Note: +xterm will normally terminate the window upon termination of the process running +within it. However, by adding a "!" to the end of the list of specified ranks, +the proper options will be provided to ensure that xterm keeps the window open +\fIafter\fP the process terminates, thus allowing you to see the process' output. +Each xterm window will subsequently need to be manually closed. +.B Note: +In some environments, xterm may require that the executable be in the user's +path, or be specified in absolute or relative terms. Thus, it may be necessary +to specify a local executable as "./foo" instead of just "foo". 
If xterm fails to +find the executable, ompi-submit will hang, but still respond correctly to a ctrl-c. +If this happens, please check that the executable is being specified correctly +and try again. +. +. +. +. +.P +To manage files and runtime environment: +. +. +.TP +.B -path\fR,\fP --path \fR\fP + that will be used when attempting to locate the requested +executables. This is used prior to using the local PATH setting. +. +. +.TP +.B --prefix \fR\fP +Prefix directory that will be used to set the \fIPATH\fR and +\fILD_LIBRARY_PATH\fR on the remote node before invoking Open MPI or +the target process. See the "Remote Execution" section, below. +. +. +.TP +.B --preload-binary +Copy the specified executable(s) to remote machines prior to starting remote processes. The +executables will be copied to the Open MPI session directory and will be deleted upon +completion of the job. +. +. +.TP +.B --preload-files +Preload the comma separated list of files to the current working directory of the remote +machines where processes will be launched prior to starting those processes. +. +. +.TP +.B --preload-files-dest-dir +The destination directory to be used for preload-files, if other than the current working +directory. By default, the absolute and relative paths provided by --preload-files are used. +. +. +.TP +.B -wd \fR\fP +Synonym for \fI-wdir\fP. +. +. +.TP +.B -wdir \fR\fP +Change to the directory before the user's program executes. +See the "Current Working Directory" section for notes on relative paths. +.B Note: +If the \fI-wdir\fP option appears both on the command line and in an +application context, the context will take precedence over the command +line. Thus, if the path to the desired wdir is different +on the backend nodes, then it must be specified as an absolute path that +is correct for the backend node. +. +. +.TP +.B -x \fR\fP +Export the specified environment variables to the remote nodes before +executing the program. 
Only one environment variable can be specified +per \fI-x\fP option. Existing environment variables can be specified +or new variable names specified with corresponding values. For +example: + \fB%\fP ompi-submit -x DISPLAY -x OFILE=/tmp/out ... + +The parser for the \fI-x\fP option is not very sophisticated; it does +not even understand quoted values. Users are advised to set variables +in the environment, and then use \fI-x\fP to export (not define) them. +. +. +. +. +.P +Setting MCA parameters: +. +. +.TP +.B -gmca\fR,\fP --gmca \fR \fP +Pass global MCA parameters that are applicable to all contexts. \fI\fP is +the parameter name; \fI\fP is the parameter value. +. +. +.TP +.B -mca\fR,\fP --mca +Send arguments to various MCA modules. See the "MCA" section, below. +. +. +. +. +.P +For debugging: +. +. +.TP +.B -debug\fR,\fP --debug +Invoke the user-level debugger indicated by the \fIorte_base_user_debugger\fP +MCA parameter. +. +. +.TP +.B -debugger\fR,\fP --debugger +Sequence of debuggers to search for when \fI--debug\fP is used (i.e. +a synonym for \fIorte_base_user_debugger\fP MCA parameter). +. +. +.TP +.B -tv\fR,\fP --tv +Launch processes under the TotalView debugger. +Deprecated backwards compatibility flag. Synonym for \fI--debug\fP. +. +. +. +. +.P +There are also other options: +. +. +.TP +.B --allow-run-as-root +Allow +.I ompi-submit +to run when executed by the root user +.RI ( ompi-submit +defaults to aborting when launched as the root user). +. +. +.TP +.B -aborted\fR,\fP --aborted \fR<#>\fP +Set the maximum number of aborted processes to display. +. +. +.TP +.B --app \fR\fP +Provide an appfile, ignoring all other command line options. +. +. +.TP +.B -cf\fR,\fP --cartofile \fR\fP +Provide a cartography file. +. +. +.TP +.B --hetero +Indicates that multiple app_contexts are being provided that are a mix of 32/64-bit binaries. +. +. 
+.TP +.B -ompi-server\fR,\fP --ompi-server +Specify the URI of the Open MPI server (or the ompi-submit to be used as the server) +, the name +of the file (specified as file:filename) that +contains that info, or the PID (specified as pid:#) of the ompi-submit to be used as + the server. +The Open MPI server is used to support multi-application data exchange via +the MPI-2 MPI_Publish_name and MPI_Lookup_name functions. +. +. +. +. +.P +The following options are useful for developers; they are not generally +useful to most ORTE and/or MPI users: +. +.TP +.B -d\fR,\fP --debug-devel +Enable debugging of the OmpiRTE (the run-time layer in Open MPI). +This is not generally useful for most users. +. +. +. +.P +There may be other options listed with \fIompi-submit --help\fP. +. +. +.SS Environment Variables +. +.TP +.B MPIEXEC_TIMEOUT +The maximum number of seconds that +.I ompi-submit +.RI ( mpiexec ) +will run. After this many seconds, +.I ompi-submit +will abort the launched job and exit. +. +. +.\" ************************** +.\" Description Section +.\" ************************** +.SH DESCRIPTION +. +One invocation of \fIompi-submit\fP starts an MPI application running under Open +MPI. If the application is single process multiple data (SPMD), the application +can be specified on the \fIompi-submit\fP command line. + +If the application is multiple instruction multiple data (MIMD), comprising of +multiple programs, the set of programs and argument can be specified in one of +two ways: Extended Command Line Arguments, and Application Context. +.PP +An application context describes the MIMD program set including all arguments +in a separate file. +.\" See appcontext(5) for a description of the application context syntax. +This file essentially contains multiple \fIompi-submit\fP command lines, less the +command name itself. The ability to specify different options for different +instantiations of a program is another reason to use an application context. 
+.PP +Extended command line arguments allow for the description of the application +layout on the command line using colons (\fI:\fP) to separate the specification +of programs and arguments. Some options are globally set across all specified +programs (e.g. --hostfile), while others are specific to a single program +(e.g. -np). +. +. +. +.SS Specifying Host Nodes +. +Host nodes can be identified on the \fIompi-submit\fP command line with the \fI-host\fP +option or in a hostfile. +. +.PP +For example, +. +.TP 4 +ompi-submit -H aa,aa,bb ./a.out +launches two processes on node aa and one on bb. +. +.PP +Or, consider the hostfile +. + + \fB%\fP cat myhostfile + aa slots=2 + bb slots=2 + cc slots=2 + +. +.PP +Since the DVM was started with \fIorte-dvm\fP, \fIorte-submit\fP +will ignore any slots arguments in the hostfile. Values provided +via hostfile to \fIorte-dvm\fP will control the behavior. +. +.PP +. +.TP 4 +ompi-submit -hostfile myhostfile ./a.out +will launch two processes on each of the three nodes. +. +.TP 4 +ompi-submit -hostfile myhostfile -host aa ./a.out +will launch two processes, both on node aa. +. +.TP 4 +ompi-submit -hostfile myhostfile -host dd ./a.out +will find no hosts to run on and abort with an error. +That is, the specified host dd is not in the specified hostfile. +. +.SS Specifying Number of Processes +. +As we have just seen, the number of processes to run can be set using the +hostfile. Other mechanisms exist. +. +.PP +The number of processes launched can be specified as a multiple of the +number of nodes or processor sockets available. For example, +. +.TP 4 +ompi-submit -H aa,bb -npersocket 2 ./a.out +launches processes 0-3 on node aa and process 4-7 on node bb, +where aa and bb are both dual-socket nodes. +The \fI-npersocket\fP option also turns on the \fI-bind-to-socket\fP option, +which is discussed in a later section. +. +.TP 4 +ompi-submit -H aa,bb -npernode 2 ./a.out +launches processes 0-1 on node aa and processes 2-3 on node bb. 
+. +.TP 4 +ompi-submit -H aa,bb -npernode 1 ./a.out +launches one process per host node. +. +.TP 4 +ompi-submit -H aa,bb -pernode ./a.out +is the same as \fI-npernode\fP 1. +. +. +.PP +Another alternative is to specify the number of processes with the +\fI-np\fP option. Consider now the hostfile +. + + \fB%\fP cat myhostfile + aa slots=4 + bb slots=4 + cc slots=4 + +. +.PP +Now, +. +.TP 4 +ompi-submit -hostfile myhostfile -np 6 ./a.out +will launch processes 0-3 on node aa and processes 4-5 on node bb. The remaining +slots in the hostfile will not be used since the \fI-np\fP option indicated +that only 6 processes should be launched. +. +.SS Mapping Processes to Nodes: Using Policies +. +The examples above illustrate the default mapping of process processes +to nodes. This mapping can also be controlled with various +\fIompi-submit\fP options that describe mapping policies. +. +. +.PP +Consider the same hostfile as above, again with \fI-np\fP 6: +. + + node aa node bb node cc + + ompi-submit 0 1 2 3 4 5 + + ompi-submit --map-by node 0 3 1 4 2 5 + + ompi-submit -nolocal 0 1 2 3 4 5 +. +.PP +The \fI--map-by node\fP option will load balance the processes across +the available nodes, numbering each process in a round-robin fashion. +. +.PP +The \fI-nolocal\fP option prevents any processes from being mapped onto the +local host (in this case node aa). While \fIompi-submit\fP typically consumes +few system resources, \fI-nolocal\fP can be helpful for launching very +large jobs where \fIompi-submit\fP may actually need to use noticeable amounts +of memory and/or processing time. +. +.PP +Just as \fI-np\fP can specify fewer processes than there are slots, it can +also oversubscribe the slots. For example, with the same hostfile: +. +.TP 4 +ompi-submit -hostfile myhostfile -np 14 ./a.out +will launch processes 0-3 on node aa, 4-7 on bb, and 8-11 on cc. It will +then add the remaining two processes to whichever nodes it chooses. +. 
+.PP +One can also specify limits to oversubscription. For example, with the same +hostfile: +. +.TP 4 +ompi-submit -hostfile myhostfile -np 14 -nooversubscribe ./a.out +will produce an error since \fI-nooversubscribe\fP prevents oversubscription. +. +.PP +Limits to oversubscription can also be specified in the hostfile itself: +. + % cat myhostfile + aa slots=4 max_slots=4 + bb max_slots=4 + cc slots=4 +. +.PP +The \fImax_slots\fP field specifies such a limit. When it does, the +\fIslots\fP value defaults to the limit. Now: +. +.TP 4 +ompi-submit -hostfile myhostfile -np 14 ./a.out +causes the first 12 processes to be launched as before, but the remaining +two processes will be forced onto node cc. The other two nodes are +protected by the hostfile against oversubscription by this job. +. +.PP +Using the \fI--nooversubscribe\fR option can be helpful since Open MPI +currently does not get "max_slots" values from the resource manager. +. +.PP +Of course, \fI-np\fP can also be used with the \fI-H\fP or \fI-host\fP +option. For example, +. +.TP 4 +ompi-submit -H aa,bb -np 8 ./a.out +launches 8 processes. Since only two hosts are specified, after the first +two processes are mapped, one to aa and one to bb, the remaining processes +oversubscribe the specified hosts. +. +.PP +And here is a MIMD example: +. +.TP 4 +ompi-submit -H aa -np 1 hostname : -H bb,cc -np 2 uptime +will launch process 0 running \fIhostname\fP on node aa and processes 1 and 2 +each running \fIuptime\fP on nodes bb and cc, respectively. +. +.SS Mapping, Ranking, and Binding: Oh My! +. +Open MPI employs a three-phase procedure for assigning process locations and +ranks: +. +.TP 10 +\fBmapping\fP +Assigns a default location to each process +. +.TP 10 +\fBranking\fP +Assigns an MPI_COMM_WORLD rank value to each process +. +.TP 10 +\fBbinding\fP +Constrains each process to run on specific processors +. 
+.PP
+The \fImapping\fP step is used to assign a default location to each process
+based on the mapper being employed. Mapping by slot, node, and sequentially results
+in the assignment of the processes to the node level. In contrast, mapping by object allows
+the mapper to assign the process to an actual object on each node.
+.
+.PP
+\fBNote:\fP the location assigned to the process is independent of where it will be bound - the
+assignment is used solely as input to the binding algorithm.
+.
+.PP
+The mapping of processes to nodes can be defined not just
+with general policies but also, if necessary, using arbitrary mappings
+that cannot be described by a simple policy. One can use the "sequential
+mapper," which reads the hostfile line by line, assigning processes
+to nodes in whatever order the hostfile specifies. Use the
+\fI-mca rmaps seq\fP option. For example, using the same hostfile
+as before:
+.
+.PP
+ompi-submit -hostfile myhostfile -mca rmaps seq ./a.out
+.
+.PP
+will launch three processes, one on each of nodes aa, bb, and cc, respectively.
+The slot counts don't matter; one process is launched per line on
+whatever node is listed on the line.
+.
+.PP
+Another way to specify arbitrary mappings is with a rankfile, which
+gives you detailed control over process binding as well. Rankfiles
+are discussed below.
+.
+.PP
+The second phase focuses on the \fIranking\fP of the process within
+the job's MPI_COMM_WORLD. Open MPI
+separates this from the mapping procedure to allow more flexibility in the
+relative placement of MPI processes. This is best illustrated by considering the
+following two cases where we used the --map-by ppr:2:socket option:
+.
+.PP
+ node aa node bb
+
+ rank-by core 0 1 ! 2 3 4 5 ! 6 7
+
+ rank-by socket 0 2 ! 1 3 4 6 ! 5 7
+
+ rank-by socket:span 0 4 ! 1 5 2 6 ! 3 7
+.
+.PP
+Ranking by core and by slot provide the identical result - a simple
+progression of MPI_COMM_WORLD ranks across each node.
Ranking by
+socket does a round-robin ranking within each node until all processes
+have been assigned an MCW rank, and then progresses to the next
+node. Adding the \fIspan\fP modifier to the ranking directive causes
+the ranking algorithm to treat the entire allocation as a single
+entity - thus, the MCW ranks are assigned across all sockets before
+circling back around to the beginning.
+.
+.PP
+The \fIbinding\fP phase actually binds each process to a given set of processors. This can
+improve performance if the operating system is placing processes
+suboptimally. For example, it might oversubscribe some multi-core
+processor sockets, leaving other sockets idle; this can lead
+processes to contend unnecessarily for common resources. Or, it
+might spread processes out too widely; this can be suboptimal if
+application performance is sensitive to interprocess communication
+costs. Binding can also keep the operating system from migrating
+processes excessively, regardless of how optimally those processes
+were placed to begin with.
+.
+.PP
+The processors to be used for binding can be identified in terms of
+topological groupings - e.g., binding to an l3cache will bind each
+process to all processors within the scope of a single L3 cache within
+their assigned location. Thus, if a process is assigned by the mapper
+to a certain socket, then a \fI--bind-to l3cache\fP directive will
+cause the process to be bound to the processors that share a single L3
+cache within that socket.
+.
+.PP
+To help balance loads, the binding directive uses a round-robin method when binding to
+levels lower than used in the mapper. For example, consider the case where a job is
+mapped to the socket level, and then bound to core. Each socket will have multiple cores,
+so if multiple processes are mapped to a given socket, the binding algorithm will assign
+each process located to a socket to a unique core in a round-robin manner.
+.
+.PP +Alternatively, processes mapped by l2cache and then bound to socket will simply be bound +to all the processors in the socket where they are located. In this manner, users can +exert detailed control over relative MCW rank location and binding. +. +.PP +Finally, \fI--report-bindings\fP can be used to report bindings. +. +.PP +As an example, consider a node with two processor sockets, each comprising +four cores. We run \fIompi-submit\fP with \fI-np 4 --report-bindings\fP and +the following additional options: +. + + % ompi-submit ... --map-by core --bind-to core + [...] ... binding child [...,0] to cpus 0001 + [...] ... binding child [...,1] to cpus 0002 + [...] ... binding child [...,2] to cpus 0004 + [...] ... binding child [...,3] to cpus 0008 + + % ompi-submit ... --map-by socket --bind-to socket + [...] ... binding child [...,0] to socket 0 cpus 000f + [...] ... binding child [...,1] to socket 1 cpus 00f0 + [...] ... binding child [...,2] to socket 0 cpus 000f + [...] ... binding child [...,3] to socket 1 cpus 00f0 + + % ompi-submit ... --map-by core:PE=2 --bind-to core + [...] ... binding child [...,0] to cpus 0003 + [...] ... binding child [...,1] to cpus 000c + [...] ... binding child [...,2] to cpus 0030 + [...] ... binding child [...,3] to cpus 00c0 + + % ompi-submit ... --bind-to none +. +.PP +Here, \fI--report-bindings\fP shows the binding of each process as a mask. +In the first case, the processes bind to successive cores as indicated by +the masks 0001, 0002, 0004, and 0008. In the second case, processes bind +to all cores on successive sockets as indicated by the masks 000f and 00f0. +The processes cycle through the processor sockets in a round-robin fashion +as many times as are needed. In the third case, the masks show us that +2 cores have been bound per process. In the fourth case, binding is +turned off and no bindings are reported. +. +.PP +Open MPI's support for process binding depends on the underlying +operating system. 
Therefore, certain process binding options may not be available +on every system. +. +.PP +Process binding can also be set with MCA parameters. +Their usage is less convenient than that of \fIompi-submit\fP options. +On the other hand, MCA parameters can be set not only on the \fIompi-submit\fP +command line, but alternatively in a system or user mca-params.conf file +or as environment variables, as described in the MCA section below. +Some examples include: +. +.PP + ompi-submit option MCA parameter key value + + --map-by core rmaps_base_mapping_policy core + --map-by socket rmaps_base_mapping_policy socket + --rank-by core rmaps_base_ranking_policy core + --bind-to core hwloc_base_binding_policy core + --bind-to socket hwloc_base_binding_policy socket + --bind-to none hwloc_base_binding_policy none +. +. +.SS Rankfiles +. +Rankfiles are text files that specify detailed information about how +individual processes should be mapped to nodes, and to which +processor(s) they should be bound. Each line of a rankfile specifies +the location of one process (for MPI jobs, the process' "rank" refers +to its rank in MPI_COMM_WORLD). The general form of each line in the +rankfile is: +. + + rank = slot= +. +.PP +For example: +. + + $ cat myrankfile + rank 0=aa slot=1:0-2 + rank 1=bb slot=0:0,1 + rank 2=cc slot=1-2 + $ ompi-submit -H aa,bb,cc,dd -rf myrankfile ./a.out +. +.PP +Means that +. + + Rank 0 runs on node aa, bound to logical socket 1, cores 0-2. + Rank 1 runs on node bb, bound to logical socket 0, cores 0 and 1. + Rank 2 runs on node cc, bound to logical cores 1 and 2. +. +.PP +Rankfiles can alternatively be used to specify \fIphysical\fP processor +locations. In this case, the syntax is somewhat different. Sockets are +no longer recognized, and the slot number given must be the number of +the physical PU as most OS's do not assign a unique physical identifier +to each core in the node. Thus, a proper physical rankfile looks something +like the following: +. 
+
+ $ cat myphysicalrankfile
+ rank 0=aa slot=1
+ rank 1=bb slot=8
+ rank 2=cc slot=6
+.
+.PP
+This means that
+.
+
+ Rank 0 will run on node aa, bound to the core that contains physical PU 1
+ Rank 1 will run on node bb, bound to the core that contains physical PU 8
+ Rank 2 will run on node cc, bound to the core that contains physical PU 6
+.
+.PP
+Rankfiles are treated as \fIlogical\fP by default, and the MCA parameter
+rmaps_rank_file_physical must be set to 1 to indicate that the rankfile
+is to be considered as \fIphysical\fP.
+.
+.PP
+The hostnames listed above are "absolute," meaning that actual
+resolvable hostnames are specified. However, hostnames can also be
+specified as "relative," meaning that they are specified in relation
+to an externally-specified list of hostnames (e.g., by ompi-submit's --host
+argument, a hostfile, or a job scheduler).
+.
+.PP
+The "relative" specification is of the form "+nX", where X is an
+integer specifying the Xth hostname in the set of all available
+hostnames, indexed from 0. For example:
+.
+
+ $ cat myrankfile
+ rank 0=+n0 slot=1:0-2
+ rank 1=+n1 slot=0:0,1
+ rank 2=+n2 slot=1-2
+ $ ompi-submit -H aa,bb,cc,dd -rf myrankfile ./a.out
+.
+.PP
+Starting with Open MPI v1.7, all socket/core slot locations are
+specified as
+.I logical
+indexes (the Open MPI v1.6 series used
+.I physical
+indexes). You can use tools such as HWLOC's "lstopo" to find the
+logical indexes of socket and cores.
+.
+.
+.SS Application Context or Executable Program?
+.
+To distinguish the two different forms, \fIompi-submit\fP
+looks on the command line for the \fI--app\fP option. If
+it is specified, then the file named on the command line is
+assumed to be an application context. If it is not
+specified, then the file is assumed to be an executable program.
+.
+.
+.
+.SS Locating Files
+.
+If no relative or absolute path is specified for a file, Open
+MPI will first look for files by searching the directories specified
+by the \fI--path\fP option.
If there is no \fI--path\fP option set or +if the file is not found at the \fI--path\fP location, then Open MPI +will search the user's PATH environment variable as defined on the +source node(s). +.PP +If a relative directory is specified, it must be relative to the initial +working directory determined by the specific starter used. For example when +using the rsh or ssh starters, the initial directory is $HOME by default. Other +starters may set the initial directory to the current working directory from +the invocation of \fIompi-submit\fP. +. +. +. +.SS Current Working Directory +. +The \fI\-wdir\fP ompi-submit option (and its synonym, \fI\-wd\fP) allows +the user to change to an arbitrary directory before the program is +invoked. It can also be used in application context files to specify +working directories on specific nodes and/or for specific +applications. +.PP +If the \fI\-wdir\fP option appears both in a context file and on the +command line, the context file directory will override the command +line value. +.PP +If the \fI-wdir\fP option is specified, Open MPI will attempt to +change to the specified directory on all of the remote nodes. If this +fails, \fIompi-submit\fP will abort. +.PP +If the \fI-wdir\fP option is \fBnot\fP specified, Open MPI will send +the directory name where \fIompi-submit\fP was invoked to each of the +remote nodes. The remote nodes will try to change to that +directory. If they are unable (e.g., if the directory does not exist on +that node), then Open MPI will use the default directory determined by +the starter. +.PP +All directory changing occurs before the user's program is invoked; it +does not wait until \fIMPI_INIT\fP is called. +. +. +. +.SS Standard I/O +. +Open MPI directs UNIX standard input to /dev/null on all processes +except the MPI_COMM_WORLD rank 0 process. The MPI_COMM_WORLD rank 0 process +inherits standard input from \fIompi-submit\fP. 
+.B Note:
+The node that invoked \fIompi-submit\fP need not be the same as the node where the
+MPI_COMM_WORLD rank 0 process resides. Open MPI handles the redirection of
+\fIompi-submit\fP's standard input to the rank 0 process.
+.PP
+Open MPI directs UNIX standard output and error from remote nodes to the node
+that invoked \fIompi-submit\fP and prints it on the standard output/error of
+\fIompi-submit\fP.
+Local processes inherit the standard output/error of \fIompi-submit\fP and transfer
+to it directly.
+.PP
+Thus it is possible to redirect standard I/O for Open MPI applications by
+using the typical shell redirection procedure on \fIompi-submit\fP.
+
+ \fB%\fP ompi-submit -np 2 my_app < my_input > my_output
+
+Note that in this example \fIonly\fP the MPI_COMM_WORLD rank 0 process will
+receive the stream from \fImy_input\fP on stdin. The stdin on all the other
+nodes will be tied to /dev/null. However, the stdout from all nodes will
+be collected into the \fImy_output\fP file.
+.
+.
+.
+.SS Signal Propagation
+.
+When orte-submit receives a SIGTERM or SIGINT, it will attempt to kill
+the entire job by sending all processes in the job a SIGTERM, waiting
+a small number of seconds, then sending all processes in the job a
+SIGKILL.
+.
+.PP
+SIGUSR1 and SIGUSR2 signals received by orte-submit are propagated to
+all processes in the job.
+.
+.PP
+A SIGTSTP signal to ompi-submit will cause a SIGSTOP signal to be sent
+to all of the programs started by ompi-submit and likewise a SIGCONT signal
+to ompi-submit will cause a SIGCONT signal to be sent.
+.
+.PP
+Other signals are not currently propagated
+by orte-submit.
+.
+.
+.SS Process Termination / Signal Handling
+.
+During the run of an MPI application, if any process dies abnormally
+(either exiting before invoking \fIMPI_FINALIZE\fP, or dying as the result of a
+signal), \fIompi-submit\fP will print out an error message and kill the rest of the
+MPI application.
+.PP +User signal handlers should probably avoid trying to cleanup MPI state +(Open MPI is currently not async-signal-safe; see MPI_Init_thread(3) +for details about +.I MPI_THREAD_MULTIPLE +and thread safety). For example, if a segmentation fault occurs in +\fIMPI_SEND\fP (perhaps because a bad buffer was passed in) and a user +signal handler is invoked, if this user handler attempts to invoke +\fIMPI_FINALIZE\fP, Bad Things could happen since Open MPI was already +"in" MPI when the error occurred. Since \fIompi-submit\fP will notice that +the process died due to a signal, it is probably not necessary (and +safest) for the user to only clean up non-MPI state. +. +. +. +.SS Process Environment +. +Processes in the MPI application inherit their environment from the +Open RTE daemon upon the node on which they are running. The +environment is typically inherited from the user's shell. On remote +nodes, the exact environment is determined by the boot MCA module +used. The \fIrsh\fR launch module, for example, uses either +\fIrsh\fR/\fIssh\fR to launch the Open RTE daemon on remote nodes, and +typically executes one or more of the user's shell-setup files before +launching the Open RTE daemon. When running dynamically linked +applications which require the \fILD_LIBRARY_PATH\fR environment +variable to be set, care must be taken to ensure that it is correctly +set when booting Open MPI. +.PP +See the "Remote Execution" section for more details. +. +. +.SS Remote Execution +. +Open MPI requires that the \fIPATH\fR environment variable be set to +find executables on remote nodes (this is typically only necessary in +\fIrsh\fR- or \fIssh\fR-based environments -- batch/scheduled +environments typically copy the current environment to the execution +of remote jobs, so if the current environment has \fIPATH\fR and/or +\fILD_LIBRARY_PATH\fR set properly, the remote nodes will also have it +set properly). 
If Open MPI was compiled with shared library support, +it may also be necessary to have the \fILD_LIBRARY_PATH\fR environment +variable set on remote nodes as well (especially to find the shared +libraries required to run user MPI applications). +.PP +However, it is not always desirable or possible to edit shell +startup files to set \fIPATH\fR and/or \fILD_LIBRARY_PATH\fR. The +\fI--prefix\fR option is provided for some simple configurations where +this is not possible. +.PP +The \fI--prefix\fR option takes a single argument: the base directory +on the remote node where Open MPI is installed. Open MPI will use +this directory to set the remote \fIPATH\fR and \fILD_LIBRARY_PATH\fR +before executing any Open MPI or user applications. This allows +running Open MPI jobs without having pre-configured the \fIPATH\fR and +\fILD_LIBRARY_PATH\fR on the remote nodes. +.PP +Open MPI adds the basename of the current +node's "bindir" (the directory where Open MPI's executables are +installed) to the prefix and uses that to set the \fIPATH\fR on the +remote node. Similarly, Open MPI adds the basename of the current +node's "libdir" (the directory where Open MPI's libraries are +installed) to the prefix and uses that to set the +\fILD_LIBRARY_PATH\fR on the remote node. For example: +.TP 15 +Local bindir: +/local/node/directory/bin +.TP +Local libdir: +/local/node/directory/lib64 +.PP +If the following command line is used: + + \fB%\fP ompi-submit --prefix /remote/node/directory + +Open MPI will add "/remote/node/directory/bin" to the \fIPATH\fR +and "/remote/node/directory/lib64" to the \fILD_LIBRARY_PATH\fR on the +remote node before attempting to execute anything. +.PP +The \fI--prefix\fR option is not sufficient if the installation paths +on the remote node are different than the local node (e.g., if "/lib" +is used on the local node, but "/lib64" is used on the remote node), +or if the installation paths are something other than a subdirectory +under a common prefix. 
+.PP +Note that executing \fIompi-submit\fR via an absolute pathname is +equivalent to specifying \fI--prefix\fR without the last subdirectory +in the absolute pathname to \fIompi-submit\fR. For example: + + \fB%\fP /usr/local/bin/ompi-submit ... + +is equivalent to + + \fB%\fP ompi-submit --prefix /usr/local +. +. +. +.SS Exported Environment Variables +. +All environment variables that are named in the form OMPI_* will automatically +be exported to new processes on the local and remote nodes. Environmental +parameters can also be set/forwarded to the new processes using the MCA +parameter \fImca_base_env_list\fP. The \fI\-x\fP option to \fIompi-submit\fP has +been deprecated, but the syntax of the MCA param follows that prior +example. While the syntax of the \fI\-x\fP option and MCA param +allows the definition of new variables, note that the parser +for these options is currently not very sophisticated - it does not even +understand quoted values. Users are advised to set variables in the +environment and use the option to export them; not to define them. +. +. +. +.SS Setting MCA Parameters +. +The \fI-mca\fP switch allows the passing of parameters to various MCA +(Modular Component Architecture) modules. +.\" Open MPI's MCA modules are described in detail in ompimca(7). +MCA modules have direct impact on MPI programs because they allow tunable +parameters to be set at run time (such as which BTL communication device driver +to use, what parameters to pass to that BTL, etc.). +.PP +The \fI-mca\fP switch takes two arguments: \fI<key>\fP and \fI<value>\fP. +The \fI<key>\fP argument generally specifies which MCA module will receive the value. +For example, the \fI<key>\fP "btl" is used to select which BTL to be used for +transporting MPI messages. The \fI<value>\fP argument is the value that is +passed. +For example: +. +.TP 4 +ompi-submit -mca btl tcp,self -np 1 foo +Tells Open MPI to use the "tcp" and "self" BTLs, and to run a single copy of +"foo" on an allocated node. +. 
+.TP +ompi-submit -mca btl self -np 1 foo +Tells Open MPI to use the "self" BTL, and to run a single copy of "foo" on an +allocated node. +.\" And so on. Open MPI's BTL MCA modules are described in ompimca_btl(7). +.PP +The \fI-mca\fP switch can be used multiple times to specify different +\fI<key>\fP and/or \fI<value>\fP arguments. If the same \fI<key>\fP is +specified more than once, the \fI<value>\fPs are concatenated with a comma +(",") separating them. +.PP +Note that the \fI-mca\fP switch is simply a shortcut for setting environment variables. +The same effect may be accomplished by setting corresponding environment +variables before running \fIompi-submit\fP. +The form of the environment variables that Open MPI sets is: + + OMPI_MCA_<key>=<value> +.PP +Thus, the \fI-mca\fP switch overrides any previously set environment +variables. The \fI-mca\fP settings similarly override MCA parameters set +in the +$OPAL_PREFIX/etc/openmpi-mca-params.conf or $HOME/.openmpi/mca-params.conf +file. +. +.PP +Unknown \fI<key>\fP arguments are still set as +environment variables -- they are not checked (by \fIompi-submit\fP) for correctness. +Illegal or incorrect \fI<value>\fP arguments may or may not be reported -- it +depends on the specific MCA module. +.PP +To find the available component types under the MCA architecture, or to find the +available parameters for a specific component, use the \fIompi_info\fP command. +See the \fIompi_info(1)\fP man page for detailed information on the command. +. +.SS Running as root +. +The Open MPI team strongly advises against executing +.I ompi-submit +as the root user. MPI applications should be run as regular +(non-root) users. +. +.PP +Reflecting this advice, ompi-submit will refuse to run as root by default. +To override this default, you can add the +.I --allow-run-as-root +option to the +.I ompi-submit +command line. +. +.SS Exit status +. +There is no standard definition for what \fIompi-submit\fP should return as an exit +status. 
After considerable discussion, we settled on the following method for +assigning the \fIompi-submit\fP exit status (note: in the following description, +the "primary" job is the initial application started by ompi-submit - all jobs that +are spawned by that job are designated "secondary" jobs): +. +.IP \[bu] 2 +if all processes in the primary job normally terminate with exit status 0, we return 0 +.IP \[bu] +if one or more processes in the primary job normally terminate with non-zero exit status, +we return the exit status of the process with the lowest MPI_COMM_WORLD rank to have a non-zero status +.IP \[bu] +if all processes in the primary job normally terminate with exit status 0, and one or more +processes in a secondary job normally terminate with non-zero exit status, we (a) return +the exit status of the process with the lowest MPI_COMM_WORLD rank in the lowest jobid to have a non-zero status, and (b) +output a message summarizing the exit status of the primary and all secondary jobs. +.IP \[bu] +if the cmd line option --report-child-jobs-separately is set, we will return -only- the +exit status of the primary job. Any non-zero exit status in secondary jobs will be +reported solely in a summary print statement. +. +.PP +By default, OMPI records and notes that MPI processes exited with non-zero termination status. +This is generally not considered an "abnormal termination" - i.e., OMPI will not abort an MPI +job if one or more processes return a non-zero status. Instead, the default behavior simply +reports the number of processes terminating with non-zero status upon completion of the job. +.PP +However, in some cases it can be desirable to have the job abort when any process terminates +with non-zero status. For example, a non-MPI job might detect a bad result from a calculation +and want to abort, but doesn't want to generate a core file. 
Or an MPI job might continue past +a call to MPI_Finalize, but indicate that all processes should abort due to some post-MPI result. +.PP +It is not anticipated that this situation will occur frequently. However, in the interest of +serving the broader community, OMPI now has a means for allowing users to direct that jobs be +aborted upon any process exiting with non-zero status. Setting the MCA parameter +"orte_abort_on_non_zero_status" to 1 will cause OMPI to abort all processes once any process + exits with non-zero status. +.PP +Terminations caused in this manner will be reported on the console as an "abnormal termination", +with the first process to so exit identified along with its exit status. +.PP +. +.\" ************************** +.\" Examples Section +.\" ************************** +.SH EXAMPLES +Be sure also to see the examples throughout the sections above. +. +.TP 4 +ompi-submit -np 4 -mca btl ib,tcp,self prog1 +Run 4 copies of prog1 using the "ib", "tcp", and "self" BTL's for the +transport of MPI messages. +. +. +.TP 4 +ompi-submit -np 4 -mca btl tcp,sm,self +.br +--mca btl_tcp_if_include eth0 prog1 +.br +Run 4 copies of prog1 using the "tcp", "sm" and "self" BTLs for the +transport of MPI messages, with TCP using only the eth0 interface to +communicate. Note that other BTLs have similar if_include MCA +parameters. +. +.\" ************************** +.\" Diagnostics Section +.\" ************************** +. +.\" .SH DIAGNOSTICS +.\" .TP 4 +.\" Error Msg: +.\" Description +. +.\" ************************** +.\" Return Value Section +.\" ************************** +. +.SH RETURN VALUE +. +\fIompi-submit\fP returns 0 if all processes started by \fIompi-submit\fP exit after calling +MPI_FINALIZE. A non-zero value is returned if an internal error occurred in +ompi-submit, or one or more processes exited before calling MPI_FINALIZE. If an +internal error occurred in ompi-submit, the corresponding error code is returned. 
+In the event that one or more processes exit before calling MPI_FINALIZE, the +return value of the MPI_COMM_WORLD rank of the process that \fIompi-submit\fP first notices died +before calling MPI_FINALIZE will be returned. Note that, in general, this will +be the first process that died but is not guaranteed to be so. +. +.\" ************************** +.\" See Also Section +.\" ************************** +. +.SH SEE ALSO +MPI_Init_thread(3) diff --git a/orte/tools/orte-submit/orte-submit.c b/orte/tools/orte-submit/orte-submit.c new file mode 100644 index 00000000000..db11bef7b1b --- /dev/null +++ b/orte/tools/orte-submit/orte-submit.c @@ -0,0 +1,182 @@ +/* -*- C -*- + * + * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2008 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006-2014 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2007-2013 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2013-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2015 Research Organization for Information Science + * and Technology (RIST). All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" +#include "orte/constants.h" + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "opal/dss/dss.h" +#include "opal/mca/event/event.h" + +#include "orte/mca/errmgr/errmgr.h" +#include "orte/orted/orted_submit.h" +#include "orte/runtime/orte_globals.h" +#include "orte/util/show_help.h" + +/* + * Globals + */ +typedef struct { + int status; + volatile bool active; + orte_job_t *jdata; +} orte_submit_status_t; + +static void launched(int index, orte_job_t *jdata, int ret, void *cbdata); +static void completed(int index, orte_job_t *jdata, int ret, void *cbdata); + + +static opal_cmd_line_init_t cmd_line_init[] = { + { "orte_execute_quiet", 'q', NULL, "quiet", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Suppress helpful messages" }, + + { NULL, '\0', "report-pid", "report-pid", 1, + &orte_cmd_line.report_pid, OPAL_CMD_LINE_TYPE_STRING, + "Printout pid on stdout [-], stderr [+], or a file [anything else]" }, + { NULL, '\0', "report-uri", "report-uri", 1, + &orte_cmd_line.report_uri, OPAL_CMD_LINE_TYPE_STRING, + "Printout URI on stdout [-], stderr [+], or a file [anything else]" }, + + /* exit status reporting */ + { "orte_report_child_jobs_separately", '\0', "report-child-jobs-separately", "report-child-jobs-separately", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Return the exit status of the primary job only" }, + + /* select XML output */ + { "orte_xml_output", '\0', "xml", "xml", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Provide all output in XML format" }, + { "orte_xml_file", '\0', "xml-file", "xml-file", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Provide all output in XML format to the specified file" }, + + { "orte_xterm", '\0', "xterm", "xterm", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Create a new xterm window and display output from the specified ranks there" }, + + /* tell the dvm to terminate */ + { NULL, '\0', "terminate", "terminate", 0, + &orte_cmd_line.terminate_dvm, 
OPAL_CMD_LINE_TYPE_BOOL, + "Terminate the DVM" }, + + /* End of list */ + { NULL, '\0', NULL, NULL, 0, + NULL, OPAL_CMD_LINE_TYPE_NULL, NULL } +}; + +int main(int argc, char *argv[]) +{ + int rc; + orte_submit_status_t launchst, completest; + opal_cmd_line_t cmd_line; + + memset(&orte_cmd_line, 0, sizeof(orte_cmd_line)); + /* setup our cmd line */ + opal_cmd_line_create(&cmd_line, cmd_line_init); + mca_base_cmd_line_setup(&cmd_line); + + /* initialize the RTE */ + if (ORTE_SUCCESS != (rc = orte_submit_init(argc, argv, &cmd_line))) { + fprintf(stderr, "Init failed due to duplicate command options\n"); + exit(rc); + } + + /* if this is the terminate command, just send it */ + if (orte_cmd_line.terminate_dvm) { + rc = orte_submit_halt(); + /* just loop the event library - the errmgr + * will exit us when the connection to our + * HNP closes */ + while (1) { + opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE); + } + } + + /* launch whatever job we were given */ + memset(&launchst, 0, sizeof(launchst)); + memset(&completest, 0, sizeof(completest)); + launchst.active = true; + completest.active = true; + if (ORTE_SUCCESS != (rc = orte_submit_job(argv, NULL, + launched, &launchst, + completed, &completest))) { + if (ORTE_ERR_OP_IN_PROGRESS == rc) { + /* terminate command was given */ + goto waiting; + } + opal_output(0, "JOB FAILED TO LAUNCH WITH ERROR %d:%s", + rc, ORTE_ERROR_NAME(rc)); + goto DONE; + } + + // wait for response and unpack the status, jobid + while (launchst.active) { + opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE); + } + if (orte_debug_flag) { + opal_output(0, "Job %s has launched", ORTE_JOBID_PRINT(launchst.jdata->jobid)); + } + if (ORTE_SUCCESS != launchst.status) { + goto DONE; + } + + waiting: + while (completest.active) { + opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE); + } + + DONE: + /* cleanup and leave */ + orte_submit_finalize(); + + if (orte_debug_flag) { + fprintf(stderr, "exiting with status %d\n", orte_exit_status); + } + 
exit(orte_exit_status); +} + +static void launched(int index, orte_job_t *jdata, int ret, void *cbdata) +{ + orte_submit_status_t *launchst = (orte_submit_status_t*)cbdata; + launchst->status = ret; + ORTE_UPDATE_EXIT_STATUS(ret); + OBJ_RETAIN(jdata); + launchst->jdata = jdata; + launchst->active = false; +} +static void completed(int index, orte_job_t *jdata, int ret, void *cbdata) +{ + orte_submit_status_t *completest = (orte_submit_status_t*)cbdata; + completest->status = ret; + ORTE_UPDATE_EXIT_STATUS(ret); + OBJ_RETAIN(jdata); + completest->jdata = jdata; + completest->active = false; +} diff --git a/orte/tools/orterun/orterun.c b/orte/tools/orterun/orterun.c index 8d5333f8b4e..f6638640829 100644 --- a/orte/tools/orterun/orterun.c +++ b/orte/tools/orterun/orterun.c @@ -81,13 +81,26 @@ #include "opal/class/opal_pointer_array.h" #include "opal/dss/dss.h" +#include "orte/util/proc_info.h" +#include "orte/util/pre_condition_transports.h" +#include "orte/util/session_dir.h" +#include "orte/util/hnp_contact.h" +#include "orte/util/show_help.h" + #include "orte/mca/dfs/dfs.h" #include "orte/mca/odls/odls.h" +#include "orte/mca/plm/plm.h" +#include "orte/mca/plm/base/plm_private.h" +#include "orte/mca/ras/ras.h" +#include "orte/mca/rmaps/rmaps_types.h" #include "orte/mca/rml/rml.h" +#include "orte/mca/rml/rml_types.h" +#include "orte/mca/rml/base/rml_contact.h" +#include "orte/mca/schizo/schizo.h" +#include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/errmgr/base/errmgr_private.h" +#include "orte/mca/grpcomm/grpcomm.h" #include "orte/mca/state/state.h" -#include "orte/util/cmd_line.h" -#include "orte/util/proc_info.h" -#include "orte/util/show_help.h" #include "orte/runtime/runtime.h" #include "orte/runtime/orte_globals.h" @@ -100,9 +113,439 @@ #include "orte/orted/orted_submit.h" #include "orterun.h" +/* instance the standard MPIR interfaces */ +#define MPIR_MAX_PATH_LENGTH 512 +#define MPIR_MAX_ARG_LENGTH 1024 +struct MPIR_PROCDESC *MPIR_proctable = NULL; +int 
MPIR_proctable_size = 0; +volatile int MPIR_being_debugged = 0; +volatile int MPIR_debug_state = 0; +int MPIR_i_am_starter = 0; +int MPIR_partial_attach_ok = 1; +char MPIR_executable_path[MPIR_MAX_PATH_LENGTH] = {0}; +char MPIR_server_arguments[MPIR_MAX_ARG_LENGTH] = {0}; +volatile int MPIR_forward_output = 0; +volatile int MPIR_forward_comm = 0; +char MPIR_attach_fifo[MPIR_MAX_PATH_LENGTH] = {0}; +int MPIR_force_to_main = 0; +static void orte_debugger_dump(void); +static void orte_debugger_init_before_spawn(orte_job_t *jdata); +static void orte_debugger_init_after_spawn(int fd, short event, void *arg); +static void orte_debugger_detached(int fd, short event, void *arg); +static void attach_debugger(int fd, short event, void *arg); +static void build_debugger_args(orte_app_context_t *debugger); +static void open_fifo (void); +static int attach_fd = -1; +static bool fifo_active=false; +static opal_event_t *attach=NULL; + +ORTE_DECLSPEC void* MPIR_Breakpoint(void); + +static void orte_timeout_wakeup(int sd, short args, void *cbdata); + +/* + * Breakpoint function for parallel debuggers + */ +void* MPIR_Breakpoint(void) +{ + return NULL; +} + +/* + * Globals + */ +static char **global_mca_env = NULL; +static orte_std_cntr_t total_num_apps = 0; +static bool want_prefix_by_default = (bool) ORTE_WANT_ORTERUN_PREFIX_BY_DEFAULT; +static bool globals_init = false; + +static opal_cmd_line_init_t cmd_line_init[] = { + /* Various "obvious" options */ + { NULL, 'h', NULL, "help", 0, + &orte_cmd_line.help, OPAL_CMD_LINE_TYPE_BOOL, + "This help message" }, + { NULL, 'V', NULL, "version", 0, + &orte_cmd_line.version, OPAL_CMD_LINE_TYPE_BOOL, + "Print version and exit" }, + { NULL, 'v', NULL, "verbose", 0, + &orte_cmd_line.verbose, OPAL_CMD_LINE_TYPE_BOOL, + "Be verbose" }, + { "orte_execute_quiet", 'q', NULL, "quiet", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Suppress helpful messages" }, + { NULL, '\0', "report-pid", "report-pid", 1, + &orte_cmd_line.report_pid, 
OPAL_CMD_LINE_TYPE_STRING, + "Printout pid on stdout [-], stderr [+], or a file [anything else]" }, + { NULL, '\0', "report-uri", "report-uri", 1, + &orte_cmd_line.report_uri, OPAL_CMD_LINE_TYPE_STRING, + "Printout URI on stdout [-], stderr [+], or a file [anything else]" }, + + /* exit status reporting */ + { "orte_report_child_jobs_separately", '\0', "report-child-jobs-separately", "report-child-jobs-separately", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Return the exit status of the primary job only" }, + + /* hetero apps */ + { "orte_hetero_apps", '\0', NULL, "hetero-apps", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Indicates that multiple app_contexts are being provided that are a mix of 32/64 bit binaries" }, + + /* select XML output */ + { "orte_xml_output", '\0', "xml", "xml", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Provide all output in XML format" }, + { "orte_xml_file", '\0', "xml-file", "xml-file", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Provide all output in XML format to the specified file" }, + + /* tag output */ + { "orte_tag_output", '\0', "tag-output", "tag-output", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Tag all output with [job,rank]" }, + { "orte_timestamp_output", '\0', "timestamp-output", "timestamp-output", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Timestamp all application process output" }, + { "orte_output_filename", '\0', "output-filename", "output-filename", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Redirect output from application processes into filename/job/rank/std[out,err,diag]" }, + { NULL, '\0', "merge-stderr-to-stdout", "merge-stderr-to-stdout", 0, + &orte_cmd_line.merge, OPAL_CMD_LINE_TYPE_BOOL, + "Merge stderr to stdout for each process"}, + { "orte_xterm", '\0', "xterm", "xterm", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Create a new xterm window and display output from the specified ranks there" }, + + /* select stdin option */ + { NULL, '\0', "stdin", "stdin", 1, + &orte_cmd_line.stdin_target, OPAL_CMD_LINE_TYPE_STRING, + "Specify procs 
to receive stdin [rank, all, none] (default: 0, indicating rank 0)" }, + + /* request that argv[0] be indexed */ + { NULL, '\0', "index-argv-by-rank", "index-argv-by-rank", 0, + &orte_cmd_line.index_argv, OPAL_CMD_LINE_TYPE_BOOL, + "Uniquely index argv[0] for each process using its rank" }, + + /* Specify the launch agent to be used */ + { "orte_launch_agent", '\0', "launch-agent", "launch-agent", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Command used to start processes on remote nodes (default: orted)" }, + + /* Preload the binary on the remote machine */ + { NULL, 's', NULL, "preload-binary", 0, + &orte_cmd_line.preload_binaries, OPAL_CMD_LINE_TYPE_BOOL, + "Preload the binary on the remote machine before starting the remote process." }, + + /* Preload files on the remote machine */ + { NULL, '\0', NULL, "preload-files", 1, + &orte_cmd_line.preload_files, OPAL_CMD_LINE_TYPE_STRING, + "Preload the comma separated list of files to the remote machines current working directory before starting the remote process." }, + +#if OPAL_ENABLE_FT_CR == 1 + /* Tell SStore to preload a snapshot before launch */ + { NULL, '\0', NULL, "sstore-load", 1, + &orte_cmd_line.sstore_load, OPAL_CMD_LINE_TYPE_STRING, + "Internal Use Only! Tell SStore to preload a snapshot before launch." 
}, +#endif + + /* Use an appfile */ + { NULL, '\0', NULL, "app", 1, + &orte_cmd_line.appfile, OPAL_CMD_LINE_TYPE_STRING, + "Provide an appfile; ignore all other command line options" }, + + /* Number of processes; -c, -n, --n, -np, and --np are all + synonyms */ + { NULL, 'c', "np", "np", 1, + &orte_cmd_line.num_procs, OPAL_CMD_LINE_TYPE_INT, + "Number of processes to run" }, + { NULL, '\0', "n", "n", 1, + &orte_cmd_line.num_procs, OPAL_CMD_LINE_TYPE_INT, + "Number of processes to run" }, + + /* maximum size of VM - typically used to subdivide an allocation */ + { "orte_max_vm_size", '\0', "max-vm-size", "max-vm-size", 1, + NULL, OPAL_CMD_LINE_TYPE_INT, + "Number of processes to run" }, + + /* Set a hostfile */ + { NULL, '\0', "hostfile", "hostfile", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Provide a hostfile" }, + { NULL, '\0', "machinefile", "machinefile", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Provide a hostfile" }, + { "orte_default_hostfile", '\0', "default-hostfile", "default-hostfile", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Provide a default hostfile" }, + { "opal_if_do_not_resolve", '\0', "do-not-resolve", "do-not-resolve", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Do not attempt to resolve interfaces" }, + + /* uri of PMIx publish/lookup server, or at least where to get it */ + { "pmix_server_uri", '\0', "ompi-server", "ompi-server", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Specify the URI of the publish/lookup server, or the name of the file (specified as file:filename) that contains that info" }, + + { "carto_file_path", '\0', "cf", "cartofile", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Provide a cartography file" }, + + { "orte_rankfile", '\0', "rf", "rankfile", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Provide a rankfile file" }, + + /* Export environment variables; potentially used multiple times, + so it does not make sense to set into a variable */ + { NULL, 'x', NULL, NULL, 1, + NULL, OPAL_CMD_LINE_TYPE_NULL, + "Export an environment variable, 
optionally specifying a value (e.g., \"-x foo\" exports the environment variable foo and takes its value from the current environment; \"-x foo=bar\" exports the environment variable name foo and sets its value to \"bar\" in the started processes)" }, + + /* Mapping controls */ + { "rmaps_base_display_map", '\0', "display-map", "display-map", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Display the process map just before launch"}, + { "rmaps_base_display_devel_map", '\0', "display-devel-map", "display-devel-map", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Display a detailed process map (mostly intended for developers) just before launch"}, + { "rmaps_base_display_topo_with_map", '\0', "display-topo", "display-topo", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Display the topology as part of the process map (mostly intended for developers) just before launch"}, + { "rmaps_base_display_diffable_map", '\0', "display-diffable-map", "display-diffable-map", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Display a diffable process map (mostly intended for developers) just before launch"}, + { NULL, 'H', "host", "host", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "List of hosts to invoke processes on" }, + { "rmaps_base_no_schedule_local", '\0', "nolocal", "nolocal", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Do not run any MPI applications on the local node" }, + { "rmaps_base_no_oversubscribe", '\0', "nooversubscribe", "nooversubscribe", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Nodes are not to be oversubscribed, even if the system supports such operation"}, + { "rmaps_base_oversubscribe", '\0', "oversubscribe", "oversubscribe", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Nodes are allowed to be oversubscribed, even on a managed system, and overloading of processing elements"}, + { "rmaps_base_cpus_per_rank", '\0', "cpus-per-proc", "cpus-per-proc", 1, + NULL, OPAL_CMD_LINE_TYPE_INT, + "Number of cpus to use for each process [default=1]" }, + { "rmaps_base_cpus_per_rank", '\0', "cpus-per-rank", 
"cpus-per-rank", 1, + NULL, OPAL_CMD_LINE_TYPE_INT, + "Synonym for cpus-per-proc" }, + + /* backward compatiblity */ + { "rmaps_base_bycore", '\0', "bycore", "bycore", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Whether to map and rank processes round-robin by core" }, + { "rmaps_base_bynode", '\0', "bynode", "bynode", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Whether to map and rank processes round-robin by node" }, + { "rmaps_base_byslot", '\0', "byslot", "byslot", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Whether to map and rank processes round-robin by slot" }, + + /* Nperxxx options that do not require topology and are always + * available - included for backwards compatibility + */ + { "rmaps_ppr_pernode", '\0', "pernode", "pernode", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Launch one process per available node" }, + { "rmaps_ppr_n_pernode", '\0', "npernode", "npernode", 1, + NULL, OPAL_CMD_LINE_TYPE_INT, + "Launch n processes per node on all allocated nodes" }, + { "rmaps_ppr_n_pernode", '\0', "N", NULL, 1, + NULL, OPAL_CMD_LINE_TYPE_INT, + "Launch n processes per node on all allocated nodes (synonym for npernode)" }, + + /* declare hardware threads as independent cpus */ + { "hwloc_base_use_hwthreads_as_cpus", '\0', "use-hwthread-cpus", "use-hwthread-cpus", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Use hardware threads as independent cpus" }, + + /* include npersocket for backwards compatibility */ + { "rmaps_ppr_n_persocket", '\0', "npersocket", "npersocket", 1, + NULL, OPAL_CMD_LINE_TYPE_INT, + "Launch n processes per socket on all allocated nodes" }, + + /* Mapping options */ + { "rmaps_base_mapping_policy", '\0', NULL, "map-by", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Mapping Policy [slot | hwthread | core | socket (default) | numa | board | node]" }, + + /* Ranking options */ + { "rmaps_base_ranking_policy", '\0', NULL, "rank-by", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Ranking Policy [slot (default) | hwthread | core | socket | numa | board | node]" }, + + /* 
Binding options */ + { "hwloc_base_binding_policy", '\0', NULL, "bind-to", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Policy for binding processes. Allowed values: none, hwthread, core, l1cache, l2cache, l3cache, socket, numa, board (\"none\" is the default when oversubscribed, \"core\" is the default when np<=2, and \"socket\" is the default when np>2). Allowed qualifiers: overload-allowed, if-supported" }, + + /* backward compatiblity */ + { "hwloc_base_bind_to_core", '\0', "bind-to-core", "bind-to-core", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Bind processes to cores" }, + { "hwloc_base_bind_to_socket", '\0', "bind-to-socket", "bind-to-socket", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Bind processes to sockets" }, + + { "hwloc_base_report_bindings", '\0', "report-bindings", "report-bindings", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Whether to report process bindings to stderr" }, + + /* slot list option */ + { "hwloc_base_slot_list", '\0', "slot-list", "slot-list", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "List of processor IDs to bind processes to [default=NULL]"}, + + /* generalized pattern mapping option */ + { "rmaps_ppr_pattern", '\0', NULL, "ppr", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Comma-separated list of number of processes on a given resource type [default: none]" }, + + /* Allocation options */ + { "orte_display_alloc", '\0', "display-allocation", "display-allocation", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Display the allocation being used by this job"}, + { "orte_display_devel_alloc", '\0', "display-devel-allocation", "display-devel-allocation", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Display a detailed list (mostly intended for developers) of the allocation being used by this job"}, + { "hwloc_base_cpu_set", '\0', "cpu-set", "cpu-set", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Comma-separated list of ranges specifying logical cpus allocated to this job [default: none]"}, + + /* mpiexec-like arguments */ + { NULL, '\0', "wdir", "wdir", 1, + 
&orte_cmd_line.wdir, OPAL_CMD_LINE_TYPE_STRING, + "Set the working directory of the started processes" }, + { NULL, '\0', "wd", "wd", 1, + &orte_cmd_line.wdir, OPAL_CMD_LINE_TYPE_STRING, + "Synonym for --wdir" }, + { NULL, '\0', "set-cwd-to-session-dir", "set-cwd-to-session-dir", 0, + &orte_cmd_line.set_cwd_to_session_dir, OPAL_CMD_LINE_TYPE_BOOL, + "Set the working directory of the started processes to their session directory" }, + { NULL, '\0', "path", "path", 1, + &orte_cmd_line.path, OPAL_CMD_LINE_TYPE_STRING, + "PATH to be used to look for executables to start processes" }, + + /* User-level debugger arguments */ + { NULL, '\0', "tv", "tv", 0, + &orte_cmd_line.debugger, OPAL_CMD_LINE_TYPE_BOOL, + "Deprecated backwards compatibility flag; synonym for \"--debug\"" }, + { NULL, '\0', "debug", "debug", 0, + &orte_cmd_line.debugger, OPAL_CMD_LINE_TYPE_BOOL, + "Invoke the user-level debugger indicated by the orte_base_user_debugger MCA parameter" }, + { "orte_base_user_debugger", '\0', "debugger", "debugger", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Sequence of debuggers to search for when \"--debug\" is used" }, + { "orte_output_debugger_proctable", '\0', "output-proctable", "output-proctable", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Output the debugger proctable after launch" }, + + /* OpenRTE arguments */ + { "orte_debug", 'd', "debug-devel", "debug-devel", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Enable debugging of OpenRTE" }, + + { "orte_debug_daemons", '\0', "debug-daemons", "debug-daemons", 0, + NULL, OPAL_CMD_LINE_TYPE_INT, + "Enable debugging of any OpenRTE daemons used by this application" }, + + { "orte_debug_daemons_file", '\0', "debug-daemons-file", "debug-daemons-file", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Enable debugging of any OpenRTE daemons used by this application, storing output in files" }, + + { "orte_leave_session_attached", '\0', "leave-session-attached", "leave-session-attached", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Enable debugging of 
OpenRTE" }, + + { "orte_do_not_launch", '\0', "do-not-launch", "do-not-launch", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Perform all necessary operations to prepare to launch the application, but do not actually launch it" }, + + { NULL, '\0', NULL, "prefix", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Prefix where Open MPI is installed on remote nodes" }, + { NULL, '\0', NULL, "noprefix", 0, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Disable automatic --prefix behavior" }, + + { "orte_report_launch_progress", '\0', "show-progress", "show-progress", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Output a brief periodic report on launch progress" }, + + { "orte_use_regexp", '\0', "use-regexp", "use-regexp", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Use regular expressions for launch" }, + + { "orte_report_events", '\0', "report-events", "report-events", 1, + NULL, OPAL_CMD_LINE_TYPE_STRING, + "Report events to a tool listening at the specified URI" }, + + { "orte_enable_recovery", '\0', "enable-recovery", "enable-recovery", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Enable recovery from process failure [Default = disabled]" }, + + { "orte_max_restarts", '\0', "max-restarts", "max-restarts", 1, + NULL, OPAL_CMD_LINE_TYPE_INT, + "Max number of times to restart a failed process" }, + + { "orte_hetero_nodes", '\0', NULL, "hetero-nodes", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Nodes in cluster may differ in topology, so send the topology back from each node [Default = false]" }, + +#if OPAL_ENABLE_CRDEBUG == 1 + { "opal_cr_enable_crdebug", '\0', "crdebug", "crdebug", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Enable C/R Debugging" }, +#endif + + { NULL, '\0', "disable-recovery", "disable-recovery", 0, + &orte_cmd_line.disable_recovery, OPAL_CMD_LINE_TYPE_BOOL, + "Disable recovery (resets all recovery options to off)" }, + + { "state_novm_select", '\0', "novm", "novm", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Execute without creating an allocation-spanning virtual machine (only start daemons on nodes 
hosting application procs)" }, + + { "orte_staged_execution", '\0', "staged", "staged", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Used staged execution if inadequate resources are present (cannot support MPI jobs)" }, + + { NULL, '\0', "allow-run-as-root", "allow-run-as-root", 0, + &orte_cmd_line.run_as_root, OPAL_CMD_LINE_TYPE_BOOL, + "Allow execution as root (STRONGLY DISCOURAGED)" }, + + { NULL, '\0', "personality", "personality", 1, + &orte_cmd_line.personality, OPAL_CMD_LINE_TYPE_STRING, + "Comma-separated list of programming model, languages, and containers being used (default=\"ompi\")" }, + + { NULL, '\0', "dvm", "dvm", 0, + &orte_cmd_line.create_dvm, OPAL_CMD_LINE_TYPE_BOOL, + "Create a persistent distributed virtual machine (DVM)" }, + + /* End of list */ + { NULL, '\0', NULL, NULL, 0, + NULL, OPAL_CMD_LINE_TYPE_NULL, NULL } +}; + /* local data */ static opal_list_t job_stack; +/* + * Local functions + */ +static int create_app(int argc, char* argv[], + orte_job_t *jdata, + orte_app_context_t **app, + bool *made_app, char ***app_env); +static int init_globals(void); +static int parse_globals(int argc, char* argv[], opal_cmd_line_t *cmd_line); +static int parse_locals(orte_job_t *jdata, int argc, char* argv[]); +static void set_classpath_jar_file(orte_app_context_t *app, int index, char *jarfile); +static int parse_appfile(orte_job_t *jdata, char *filename, char ***env); +static void run_debugger(char *basename, opal_cmd_line_t *cmd_line, + int argc, char *argv[], int num_procs) __opal_attribute_noreturn__; + static void spawn_next_job(opal_buffer_t *bptr, void *cbdata) { orte_job_t *jdata = (orte_job_t*)cbdata; @@ -146,11 +589,63 @@ static void run_next_job(int fd, short args, void *cbdata) int orterun(int argc, char *argv[]) { + int rc; + opal_cmd_line_t cmd_line; + char *param; + orte_job_t *daemons; + orte_app_context_t *app, *dapp; + orte_job_t *jdata=NULL, *jptr; +#if OPAL_ENABLE_FT_CR == 1 + char *tmp_env_var = NULL; +#endif + + /* find our basename 
(the name of the executable) so that we can + use it in pretty-print error messages */ + orte_basename = opal_basename(argv[0]); - if (ORTE_SUCCESS != orte_submit_init(argc, argv, NULL)) { + /* bozo check - we don't allow recursive calls of orterun */ + if (NULL != getenv("OMPI_UNIVERSE_SIZE")) { + fprintf(stderr, "\n\n**********************************************************\n\n"); + fprintf(stderr, "Open MPI does not support recursive calls of %s\n", orte_basename); + fprintf(stderr, "\n**********************************************************\n"); exit(1); } + /* Setup and parse the command line */ + init_globals(); + opal_cmd_line_create(&cmd_line, cmd_line_init); + mca_base_cmd_line_setup(&cmd_line); + if (OPAL_SUCCESS != (rc = opal_cmd_line_parse(&cmd_line, true, + argc, argv)) ) { + if (OPAL_ERR_SILENT != rc) { + fprintf(stderr, "%s: command line error (%s)\n", argv[0], + opal_strerror(rc)); + } + return rc; + } + + /* print version if requested. Do this before check for help so + that --version --help works as one might expect. */ + if (orte_cmd_line.version) { + char *str, *project_name = NULL; + if (0 == strcmp(orte_basename, "mpirun")) { + project_name = "Open MPI"; + } else { + project_name = "OpenRTE"; + } + str = opal_info_make_version_str("all", + OPAL_MAJOR_VERSION, OPAL_MINOR_VERSION, + OPAL_RELEASE_VERSION, + OPAL_GREEK_VERSION, + OPAL_REPO_REV); + if (NULL != str) { + fprintf(stdout, "%s (%s) %s\n\nReport bugs to %s\n", + orte_basename, project_name, str, PACKAGE_BUGREPORT); + free(str); + } + exit(0); + } + /* check if we are running as root - if we are, then only allow * us to proceed if the allow-run-as-root flag was given. 
Otherwise, * exit with a giant warning flag @@ -173,6 +668,354 @@ int orterun(int argc, char *argv[]) exit(1); } + /* + * Since this process can now handle MCA/GMCA parameters, make sure to + * process them - we can do this step WITHOUT first calling opal_init + */ + if (OPAL_SUCCESS != mca_base_cmd_line_process_args(&cmd_line, &environ, &environ)) { + exit(1); + } + + /* Ensure that enough of OPAL is setup for us to be able to run */ + /* + * NOTE: (JJH) + * We need to allow 'mca_base_cmd_line_process_args()' to process command + * line arguments *before* calling opal_init_util() since the command + * line could contain MCA parameters that affect the way opal_init_util() + * functions. AMCA parameters are one such option normally received on the + * command line that affect the way opal_init_util() behaves. + * It is "safe" to call mca_base_cmd_line_process_args() before + * opal_init_util() since mca_base_cmd_line_process_args() does *not* + * depend upon opal_init_util() functionality. + */ + /* Need to initialize OPAL so that install_dirs are filled in */ + if (OPAL_SUCCESS != opal_init(&argc, &argv)) { + exit(1); + } + + /* Check for help request */ + if (orte_cmd_line.help) { + char *str, *args = NULL; + char *project_name = NULL; + if (0 == strcmp(orte_basename, "mpirun")) { + project_name = "Open MPI"; + } else { + project_name = "OpenRTE"; + } + args = opal_cmd_line_get_usage_msg(&cmd_line); + str = opal_show_help_string("help-orterun.txt", "orterun:usage", false, + orte_basename, project_name, OPAL_VERSION, + orte_basename, args, + PACKAGE_BUGREPORT); + if (NULL != str) { + printf("%s", str); + free(str); + } + free(args); + + /* If someone asks for help, that should be all we do */ + opal_finalize(); + exit(0); + } + + /* may look strange, but the way we handle prefix is a little weird + * and probably needs to be addressed more fully at some future point. + * For now, we have a conflict between app_files and cmd line usage. 
+ * Since app_files are used by the C/R system, we will make an + * adjustment here to avoid perturbing that system. + * + * We cannot just have the cmd line parser place any found value + * in the global struct as the app_file parser would replace it. + * So handle this specific cmd line option manually. + */ + orte_cmd_line.prefix = NULL; + orte_cmd_line.path_to_mpirun = NULL; + if (opal_cmd_line_is_taken(&cmd_line, "prefix") || + '/' == argv[0][0] || want_prefix_by_default) { + size_t param_len; + if ('/' == argv[0][0]) { + char* tmp_basename = NULL; + /* If they specified an absolute path, strip off the + /bin/" and leave just the prefix */ + orte_cmd_line.path_to_mpirun = opal_dirname(argv[0]); + /* Quick sanity check to ensure we got + something/bin/ and that the installation + tree is at least more or less what we expect it to + be */ + tmp_basename = opal_basename(orte_cmd_line.path_to_mpirun); + if (0 == strcmp("bin", tmp_basename)) { + char* tmp = orte_cmd_line.path_to_mpirun; + orte_cmd_line.path_to_mpirun = opal_dirname(tmp); + free(tmp); + } else { + free(orte_cmd_line.path_to_mpirun); + orte_cmd_line.path_to_mpirun = NULL; + } + free(tmp_basename); + } + /* if both are given, check to see if they match */ + if (opal_cmd_line_is_taken(&cmd_line, "prefix") && NULL != orte_cmd_line.path_to_mpirun) { + char *tmp_basename; + /* if they don't match, then that merits a warning */ + param = strdup(opal_cmd_line_get_param(&cmd_line, "prefix", 0, 0)); + /* ensure we strip any trailing '/' */ + if (0 == strcmp(OPAL_PATH_SEP, &(param[strlen(param)-1]))) { + param[strlen(param)-1] = '\0'; + } + tmp_basename = strdup(orte_cmd_line.path_to_mpirun); + if (0 == strcmp(OPAL_PATH_SEP, &(tmp_basename[strlen(tmp_basename)-1]))) { + tmp_basename[strlen(tmp_basename)-1] = '\0'; + } + if (0 != strcmp(param, tmp_basename)) { + orte_show_help("help-orterun.txt", "orterun:double-prefix", + true, orte_basename, orte_basename, + param, tmp_basename, orte_basename); + /* use the 
prefix over the path-to-mpirun so that + * people can specify the backend prefix as different + * from the local one + */ + free(orte_cmd_line.path_to_mpirun); + orte_cmd_line.path_to_mpirun = NULL; + } + free(tmp_basename); + } else if (NULL != orte_cmd_line.path_to_mpirun) { + param = strdup(orte_cmd_line.path_to_mpirun); + } else if (opal_cmd_line_is_taken(&cmd_line, "prefix")){ + /* must be --prefix alone */ + param = strdup(opal_cmd_line_get_param(&cmd_line, "prefix", 0, 0)); + } else { + /* --enable-orterun-prefix-default was given to orterun */ + param = strdup(opal_install_dirs.prefix); + } + + if (NULL != param) { + /* "Parse" the param, aka remove superfluous path_sep. */ + param_len = strlen(param); + while (0 == strcmp (OPAL_PATH_SEP, &(param[param_len-1]))) { + param[param_len-1] = '\0'; + param_len--; + if (0 == param_len) { + orte_show_help("help-orterun.txt", "orterun:empty-prefix", + true, orte_basename, orte_basename); + free(param); + return ORTE_ERR_FATAL; + } + } + + orte_cmd_line.prefix = param; + } + want_prefix_by_default = true; + } + + /* flag that I am the HNP - needs to be done prior to + * registering params + */ + orte_process_info.proc_type = ORTE_PROC_HNP; + + /* Setup MCA params */ + orte_register_params(); + + /* save the environment for launch purposes. 
This MUST be + * done so that we can pass it to any local procs we + * spawn - otherwise, those local procs won't see any + * non-MCA envars that were set in the enviro prior to calling + * orterun + */ + orte_launch_environ = opal_argv_copy(environ); + opal_unsetenv(OPAL_MCA_PREFIX"ess", &orte_launch_environ); + opal_unsetenv(OPAL_MCA_PREFIX"pmix", &orte_launch_environ); + + /* Initialize our Open RTE environment + * Set the flag telling orte_init that I am NOT a + * singleton, but am "infrastructure" - prevents setting + * up incorrect infrastructure that only a singleton would + * require + */ + if (ORTE_SUCCESS != (rc = orte_init(&argc, &argv, ORTE_PROC_HNP))) { + /* cannot call ORTE_ERROR_LOG as it could be the errmgr + * never got loaded! + */ + return rc; + } + /* finalize OPAL. As it was opened again from orte_init->opal_init + * we continue to have a reference count on it. So we have to finalize it twice... + */ + opal_finalize(); + + /* default our personality to OMPI */ + if (NULL == orte_cmd_line.personality) { + opal_argv_append_nosize(&orte_cmd_line.personalities, "ompi"); + } else { + orte_cmd_line.personalities = opal_argv_split(orte_cmd_line.personality, ','); + } + /* Check for some "global" command line params */ + parse_globals(argc, argv, &cmd_line); + OBJ_DESTRUCT(&cmd_line); + + /* create a new job object to hold the info for this one - the + * jobid field will be filled in by the PLM when the job is + * launched + */ + jdata = OBJ_NEW(orte_job_t); + if (NULL == jdata) { + /* cannot call ORTE_ERROR_LOG as the errmgr + * hasn't been loaded yet! 
+ */ + return ORTE_ERR_OUT_OF_RESOURCE; + } + jdata->personality = opal_argv_copy(orte_cmd_line.personalities); + + /* check what user wants us to do with stdin */ + if (0 == strcmp(orte_cmd_line.stdin_target, "all")) { + jdata->stdin_target = ORTE_VPID_WILDCARD; + } else if (0 == strcmp(orte_cmd_line.stdin_target, "none")) { + jdata->stdin_target = ORTE_VPID_INVALID; + } else { + jdata->stdin_target = strtoul(orte_cmd_line.stdin_target, NULL, 10); + } + + /* if we want the argv's indexed, indicate that */ + if (orte_cmd_line.index_argv) { + orte_set_attribute(&jdata->attributes, ORTE_JOB_INDEX_ARGV, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL); + } + + /* Parse each app, adding it to the job object */ + parse_locals(jdata, argc, argv); + + if (0 == jdata->num_apps) { + /* This should never happen -- this case should be caught in + create_app(), but let's just double check... */ + orte_show_help("help-orterun.txt", "orterun:nothing-to-do", + true, orte_basename); + exit(ORTE_ERROR_DEFAULT_EXIT_CODE); + } + +#if OPAL_ENABLE_FT_CR == 1 + /* Disable OPAL CR notifications for this tool */ + opal_cr_set_enabled(false); + (void) mca_base_var_env_name("opal_cr_is_tool", &tmp_env_var); + opal_setenv(tmp_env_var, + "1", + true, &environ); + free(tmp_env_var); +#endif + + /* get the daemon job object */ + daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); + + /* check for request to report uri */ + if (NULL != orte_cmd_line.report_uri) { + FILE *fp; + char *rml_uri; + rml_uri = orte_rml.get_contact_info(); + if (0 == strcmp(orte_cmd_line.report_uri, "-")) { + /* if '-', then output to stdout */ + printf("%s\n", (NULL == rml_uri) ? "NULL" : rml_uri); + } else if (0 == strcmp(orte_cmd_line.report_uri, "+")) { + /* if '+', output to stderr */ + fprintf(stderr, "%s\n", (NULL == rml_uri) ? 
"NULL" : rml_uri); + } else { + fp = fopen(orte_cmd_line.report_uri, "w"); + if (NULL == fp) { + orte_show_help("help-orterun.txt", "orterun:write_file", false, + orte_basename, "uri", orte_cmd_line.report_uri); + exit(0); + } + fprintf(fp, "%s\n", (NULL == rml_uri) ? "NULL" : rml_uri); + fclose(fp); + } + if (NULL != rml_uri) { + free(rml_uri); + } + } + + /* If we have a prefix, then modify the PATH and + LD_LIBRARY_PATH environment variables in our copy. This + will ensure that any locally-spawned children will + have our executables and libraries in their path + + For now, default to the prefix_dir provided in the first app_context. + Since there always MUST be at least one app_context, we are safe in + doing this. + */ + param = NULL; + if (NULL != (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0)) && + orte_get_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, (void**)¶m, OPAL_STRING)) { + char *oldenv, *newenv, *lib_base, *bin_base; + + /* copy the prefix into the daemon job so that any launcher + * can find the orteds when we launch the virtual machine + */ + if (NULL == (dapp = (orte_app_context_t*)opal_pointer_array_get_item(daemons->apps, 0))) { + /* that's an error in the ess */ + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return ORTE_ERR_NOT_FOUND; + } + orte_set_attribute(&dapp->attributes, ORTE_APP_PREFIX_DIR, ORTE_ATTR_LOCAL, param, OPAL_STRING); + + lib_base = opal_basename(opal_install_dirs.libdir); + bin_base = opal_basename(opal_install_dirs.bindir); + + /* Reset PATH */ + newenv = opal_os_path( false, param, bin_base, NULL ); + oldenv = getenv("PATH"); + if (NULL != oldenv) { + char *temp; + asprintf(&temp, "%s:%s", newenv, oldenv ); + free( newenv ); + newenv = temp; + } + opal_setenv("PATH", newenv, true, &orte_launch_environ); + if (orte_debug_flag) { + opal_output(0, "%s: reset PATH: %s", orte_basename, newenv); + } + free(newenv); + free(bin_base); + + /* Reset LD_LIBRARY_PATH */ + newenv = opal_os_path( false, param, 
lib_base, NULL ); + oldenv = getenv("LD_LIBRARY_PATH"); + if (NULL != oldenv) { + char* temp; + asprintf(&temp, "%s:%s", newenv, oldenv); + free(newenv); + newenv = temp; + } + opal_setenv("LD_LIBRARY_PATH", newenv, true, &orte_launch_environ); + if (orte_debug_flag) { + opal_output(0, "%s: reset LD_LIBRARY_PATH: %s", + orte_basename, newenv); + } + free(newenv); + free(lib_base); + free(param); + } + + /* pre-condition any network transports that require it */ + if (ORTE_SUCCESS != (rc = orte_pre_condition_transports(jdata))) { + ORTE_ERROR_LOG(rc); + orte_show_help("help-orterun.txt", "orterun:precondition", false, + orte_basename, NULL, NULL, rc); + ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); + goto DONE; + } + + /* if we were asked to tag output, mark it so */ + if (orte_tag_output) { + orte_set_attribute(&jdata->attributes, ORTE_JOB_TAG_OUTPUT, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL); + } + /* if we were asked to timestamp output, mark it so */ + if (orte_timestamp_output) { + orte_set_attribute(&jdata->attributes, ORTE_JOB_TIMESTAMP_OUTPUT, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL); + } + /* if we were asked to output to files, pass it along */ + if (NULL != orte_output_filename) { + orte_set_attribute(&jdata->attributes, ORTE_JOB_OUTPUT_TO_FILE, ORTE_ATTR_GLOBAL, orte_output_filename, OPAL_STRING); + } + /* if we were asked to merge stderr to stdout, mark it so */ + if (orte_cmd_line.merge) { + orte_set_attribute(&jdata->attributes, ORTE_JOB_MERGE_STDERR_STDOUT, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL); + } /* setup to listen for commands sent specifically to me, even though I would probably * be the one sending them! 
Unfortunately, since I am a participating daemon, * there are times I need to send a command to "all daemons", and that means *I* have @@ -181,16 +1024,15 @@ int orterun(int argc, char *argv[]) orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON, ORTE_RML_PERSISTENT, orte_daemon_recv, NULL); - /* spawn the job and its daemons */ - if (ORTE_SUCCESS != orte_submit_job(argv, NULL, - NULL, NULL, - NULL, NULL)) { - ORTE_UPDATE_EXIT_STATUS(1); - goto DONE; - } - + /* setup for debugging */ + orte_debugger_init_before_spawn(jdata); + orte_state.add_job_state(ORTE_JOB_STATE_READY_FOR_DEBUGGERS, + orte_debugger_init_after_spawn, + ORTE_SYS_PRI); + orte_state.add_job_state(ORTE_JOB_STATE_DEBUGGER_DETACH, + orte_debugger_detached, + ORTE_SYS_PRI); -#if 0 if (orte_staged_execution) { /* staged execution is requested - each app_context * is treated as a separate job and executed in @@ -223,7 +1065,34 @@ int orterun(int argc, char *argv[]) goto DONE; } } -#endif + + /* check for suicide test directives */ + if (NULL != getenv("ORTE_TEST_HNP_SUICIDE") || + NULL != getenv("ORTE_TEST_ORTED_SUICIDE")) { + /* don't forward IO from this process so we can + * see any debug after daemon termination */ + ORTE_FLAG_UNSET(jdata, ORTE_JOB_FLAG_FORWARD_OUTPUT); + } + + /* check for a job timeout specification, to be provided in seconds + * as that is what MPICH used + */ + if (NULL != (param = getenv("MPIEXEC_TIMEOUT"))) { + if (NULL == (orte_mpiexec_timeout = OBJ_NEW(orte_timer_t))) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + ORTE_UPDATE_EXIT_STATUS(ORTE_ERR_OUT_OF_RESOURCE); + goto DONE; + } + orte_mpiexec_timeout->tv.tv_sec = strtol(param, NULL, 10); + orte_mpiexec_timeout->tv.tv_usec = 0; + opal_event_evtimer_set(orte_event_base, orte_mpiexec_timeout->ev, + orte_timeout_wakeup, jdata); + opal_event_set_priority(orte_mpiexec_timeout->ev, ORTE_ERROR_PRI); + opal_event_evtimer_add(orte_mpiexec_timeout->ev, &orte_mpiexec_timeout->tv); + } + + /* spawn the job and its daemons 
*/ + rc = orte_plm.spawn(jdata); /* loop the event lib until an exit event is detected */ while (orte_event_base_active) { @@ -235,12 +1104,12 @@ int orterun(int argc, char *argv[]) DONE: /* if it was created, remove the debugger attach fifo */ - if (0 <= orte_debugger_attach_fd) { - if (orte_debugger_fifo_active) { - opal_event_del(orte_debugger_attach); - free(orte_debugger_attach); + if (0 <= attach_fd) { + if (fifo_active) { + opal_event_del(attach); + free(attach); } - close(orte_debugger_attach_fd); + close(attach_fd); unlink(MPIR_attach_fifo); } @@ -255,3 +1124,1746 @@ int orterun(int argc, char *argv[]) } exit(orte_exit_status); } + +static int init_globals(void) +{ + /* Only CONSTRUCT things once */ + if (!globals_init) { + orte_cmd_line.env_val = NULL; + orte_cmd_line.appfile = NULL; + orte_cmd_line.wdir = NULL; + orte_cmd_line.path = NULL; + orte_cmd_line.stdin_target = "0"; + orte_cmd_line.report_pid = NULL; + orte_cmd_line.report_uri = NULL; + orte_cmd_line.disable_recovery = false; + orte_cmd_line.index_argv = false; + orte_cmd_line.run_as_root = false; + orte_cmd_line.personality = NULL; + orte_cmd_line.personalities = NULL; + orte_cmd_line.create_dvm = false; + } + + /* Reset the other fields every time */ + + orte_cmd_line.help = false; + orte_cmd_line.version = false; + orte_cmd_line.verbose = false; + orte_cmd_line.debugger = false; + orte_cmd_line.num_procs = 0; + if( NULL != orte_cmd_line.env_val ) + free( orte_cmd_line.env_val ); + orte_cmd_line.env_val = NULL; + if( NULL != orte_cmd_line.appfile ) + free( orte_cmd_line.appfile ); + orte_cmd_line.appfile = NULL; + if( NULL != orte_cmd_line.wdir ) + free( orte_cmd_line.wdir ); + orte_cmd_line.set_cwd_to_session_dir = false; + orte_cmd_line.wdir = NULL; + if( NULL != orte_cmd_line.path ) + free( orte_cmd_line.path ); + orte_cmd_line.path = NULL; + + orte_cmd_line.preload_binaries = false; + orte_cmd_line.preload_files = NULL; + +#if OPAL_ENABLE_FT_CR == 1 + orte_cmd_line.sstore_load = NULL; 
+#endif + + /* All done */ + globals_init = true; + return ORTE_SUCCESS; +} + + +static int parse_globals(int argc, char* argv[], opal_cmd_line_t *cmd_line) +{ + /* check for request to report pid */ + if (NULL != orte_cmd_line.report_pid) { + FILE *fp; + if (0 == strcmp(orte_cmd_line.report_pid, "-")) { + /* if '-', then output to stdout */ + printf("%d\n", (int)getpid()); + } else if (0 == strcmp(orte_cmd_line.report_pid, "+")) { + /* if '+', output to stderr */ + fprintf(stderr, "%d\n", (int)getpid()); + } else { + fp = fopen(orte_cmd_line.report_pid, "w"); + if (NULL == fp) { + orte_show_help("help-orterun.txt", "orterun:write_file", false, + orte_basename, "pid", orte_cmd_line.report_pid); + exit(0); + } + fprintf(fp, "%d\n", (int)getpid()); + fclose(fp); + } + } + + /* Do we want a user-level debugger? */ + + if (orte_cmd_line.debugger) { + run_debugger(orte_basename, cmd_line, argc, argv, orte_cmd_line.num_procs); + } + + /* if recovery was disabled on the cmd line, do so */ + if (orte_cmd_line.disable_recovery) { + orte_enable_recovery = false; + orte_max_restarts = 0; + } + + return ORTE_SUCCESS; +} + + +static int parse_locals(orte_job_t *jdata, int argc, char* argv[]) +{ + int i, rc, app_num; + int temp_argc; + char **temp_argv, **env; + orte_app_context_t *app; + bool made_app; + orte_std_cntr_t j, size1; + + /* Make the apps */ + temp_argc = 0; + temp_argv = NULL; + opal_argv_append(&temp_argc, &temp_argv, argv[0]); + + /* NOTE: This bogus env variable is necessary in the calls to + create_app(), below. See comment immediately before the + create_app() function for an explanation. 
*/ + + env = NULL; + for (app_num = 0, i = 1; i < argc; ++i) { + if (0 == strcmp(argv[i], ":")) { + /* Make an app with this argv */ + if (opal_argv_count(temp_argv) > 1) { + if (NULL != env) { + opal_argv_free(env); + env = NULL; + } + app = NULL; + rc = create_app(temp_argc, temp_argv, jdata, &app, &made_app, &env); + /** keep track of the number of apps - point this app_context to that index */ + if (ORTE_SUCCESS != rc) { + /* Assume that the error message has already been + printed; no need to cleanup -- we can just + exit */ + exit(1); + } + if (made_app) { + app->idx = app_num; + ++app_num; + opal_pointer_array_add(jdata->apps, app); + ++jdata->num_apps; + if (ORTE_SUCCESS != (rc = orte_schizo.setup_app(jdata->personality, app))) { + return rc; + } + } + + /* Reset the temps */ + + temp_argc = 0; + temp_argv = NULL; + opal_argv_append(&temp_argc, &temp_argv, argv[0]); + } + } else { + opal_argv_append(&temp_argc, &temp_argv, argv[i]); + } + } + + if (opal_argv_count(temp_argv) > 1) { + app = NULL; + rc = create_app(temp_argc, temp_argv, jdata, &app, &made_app, &env); + if (ORTE_SUCCESS != rc) { + /* Assume that the error message has already been printed; + no need to cleanup -- we can just exit */ + exit(1); + } + if (made_app) { + app->idx = app_num; + ++app_num; + opal_pointer_array_add(jdata->apps, app); + ++jdata->num_apps; + if (ORTE_SUCCESS != (rc = orte_schizo.setup_app(jdata->personality, app))) { + return rc; + } + } + } + if (NULL != env) { + opal_argv_free(env); + } + opal_argv_free(temp_argv); + + /* Once we've created all the apps, add the global MCA params to + each app's environment (checking for duplicates, of + course -- yay opal_environ_merge()). 
*/ + + if (NULL != global_mca_env) { + size1 = (size_t)opal_pointer_array_get_size(jdata->apps); + /* Iterate through all the apps */ + for (j = 0; j < size1; ++j) { + app = (orte_app_context_t *) + opal_pointer_array_get_item(jdata->apps, j); + if (NULL != app) { + /* Use handy utility function */ + env = opal_environ_merge(global_mca_env, app->env); + opal_argv_free(app->env); + app->env = env; + } + } + } + + /* Now take a subset of the MCA params and set them as MCA + overrides here in orterun (so that when we orte_init() later, + all the components see these MCA params). Here's how we decide + which subset of the MCA params we set here in orterun: + + 1. If any global MCA params were set, use those + 2. If no global MCA params were set and there was only one app, + then use its app MCA params + 3. Otherwise, don't set any + */ + + env = NULL; + if (NULL != global_mca_env) { + env = global_mca_env; + } else { + if (opal_pointer_array_get_size(jdata->apps) >= 1) { + /* Remember that pointer_array's can be padded with NULL + entries; so only use the app's env if there is exactly + 1 non-NULL entry */ + app = (orte_app_context_t *) + opal_pointer_array_get_item(jdata->apps, 0); + if (NULL != app) { + env = app->env; + for (j = 1; j < opal_pointer_array_get_size(jdata->apps); ++j) { + if (NULL != opal_pointer_array_get_item(jdata->apps, j)) { + env = NULL; + break; + } + } + } + } + } + + if (NULL != env) { + size1 = opal_argv_count(env); + for (j = 0; j < size1; ++j) { + /* Use-after-Free error possible here. putenv does not copy + * the string passed to it, and instead stores only the pointer. + * env[j] may be freed later, in which case the pointer + * in environ will now be left dangling into a deallocated + * region. + * So we make a copy of the variable. 
+ */ + char *s = strdup(env[j]); + + if (NULL == s) { + return OPAL_ERR_OUT_OF_RESOURCE; + } + putenv(s); + } + } + + /* All done */ + + return ORTE_SUCCESS; +} + + +/* + * This function takes a "char ***app_env" parameter to handle the + * specific case: + * + * orterun --mca foo bar -app appfile + * + * That is, we'll need to keep foo=bar, but the presence of the app + * file will cause an invocation of parse_appfile(), which will cause + * one or more recursive calls back to create_app(). Since the + * foo=bar value applies globally to all apps in the appfile, we need + * to pass in the "base" environment (that contains the foo=bar value) + * when we parse each line in the appfile. + * + * This is really just a special case -- when we have a simple case like: + * + * orterun --mca foo bar -np 4 hostname + * + * Then the upper-level function (parse_locals()) calls create_app() + * with a NULL value for app_env, meaning that there is no "base" + * environment that the app needs to be created from. + */ +static int create_app(int argc, char* argv[], + orte_job_t *jdata, + orte_app_context_t **app_ptr, + bool *made_app, char ***app_env) +{ + opal_cmd_line_t cmd_line; + char cwd[OPAL_PATH_MAX]; + int i, j, count, rc; + char *param, *value; + orte_app_context_t *app = NULL; + bool cmd_line_made = false; + bool found = false; + char *appname; + + *made_app = false; + + /* Pre-process the command line if we are going to parse an appfile later. + * save any mca command line args so they can be passed + * separately to the daemons. + * Use Case: + * $ cat launch.appfile + * -np 1 -mca aaa bbb ./my-app -mca ccc ddd + * -np 1 -mca aaa bbb ./my-app -mca eee fff + * $ mpirun -np 2 -mca foo bar --app launch.appfile + * Only pick up '-mca foo bar' on this pass. + */ + if (NULL != orte_cmd_line.appfile) { + if (ORTE_SUCCESS != (rc = orte_schizo.parse_cli(orte_cmd_line.personalities, argc, 0, argv))) { + goto cleanup; + } + } + + /* Parse application command line options. 
*/ + + init_globals(); + opal_cmd_line_create(&cmd_line, cmd_line_init); + mca_base_cmd_line_setup(&cmd_line); + cmd_line_made = true; + rc = opal_cmd_line_parse(&cmd_line, true, argc, argv); + if (ORTE_SUCCESS != rc) { + goto cleanup; + } + mca_base_cmd_line_process_args(&cmd_line, app_env, &global_mca_env); + + /* Is there an appfile in here? */ + + if (NULL != orte_cmd_line.appfile) { + OBJ_DESTRUCT(&cmd_line); + return parse_appfile(jdata, strdup(orte_cmd_line.appfile), app_env); + } + + /* Setup application context */ + + app = OBJ_NEW(orte_app_context_t); + opal_cmd_line_get_tail(&cmd_line, &count, &app->argv); + + /* See if we have anything left */ + + if (0 == count) { + orte_show_help("help-orterun.txt", "orterun:executable-not-specified", + true, orte_basename, orte_basename); + rc = ORTE_ERR_NOT_FOUND; + goto cleanup; + } + + /* + * Get mca parameters so we can pass them to the daemons. + * Use the count determined above to make sure we do not go past + * the executable name. Example: + * mpirun -np 2 -mca foo bar ./my-app -mca bip bop + * We want to pick up '-mca foo bar' but not '-mca bip bop' + */ + if (ORTE_SUCCESS != (rc = orte_schizo.parse_cli(orte_cmd_line.personalities, + argc, count, argv))) { + goto cleanup; + } + + /* Grab all OMPI_* environment variables */ + + app->env = opal_argv_copy(*app_env); + if (ORTE_SUCCESS != (rc = orte_schizo.parse_env(orte_cmd_line.personalities, + orte_cmd_line.path, + &cmd_line, + environ, &app->env))) { + goto cleanup; + } + + + /* Did the user request a specific wdir? 
*/ + + if (NULL != orte_cmd_line.wdir) { + /* if this is a relative path, convert it to an absolute path */ + if (opal_path_is_absolute(orte_cmd_line.wdir)) { + app->cwd = strdup(orte_cmd_line.wdir); + } else { + /* get the cwd */ + if (OPAL_SUCCESS != (rc = opal_getcwd(cwd, sizeof(cwd)))) { + orte_show_help("help-orterun.txt", "orterun:init-failure", + true, "get the cwd", rc); + goto cleanup; + } + /* construct the absolute path */ + app->cwd = opal_os_path(false, cwd, orte_cmd_line.wdir, NULL); + } + orte_set_attribute(&app->attributes, ORTE_APP_USER_CWD, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL); + } else if (orte_cmd_line.set_cwd_to_session_dir) { + orte_set_attribute(&app->attributes, ORTE_APP_SSNDIR_CWD, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL); + orte_set_attribute(&app->attributes, ORTE_APP_USER_CWD, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL); + } else { + if (OPAL_SUCCESS != (rc = opal_getcwd(cwd, sizeof(cwd)))) { + orte_show_help("help-orterun.txt", "orterun:init-failure", + true, "get the cwd", rc); + goto cleanup; + } + app->cwd = strdup(cwd); + } + + /* if this is the first app_context, check for prefix directions. + * We only do this for the first app_context because the launchers + * only look at the first one when setting the prefix - we do NOT + * support per-app_context prefix settings! + */ + if (0 == total_num_apps) { + /* Check to see if the user explicitly wanted to disable automatic + --prefix behavior */ + + if (opal_cmd_line_is_taken(&cmd_line, "noprefix")) { + want_prefix_by_default = false; + } + + /* Did the user specify a prefix, or want prefix by default? 
*/ + if (opal_cmd_line_is_taken(&cmd_line, "prefix") || want_prefix_by_default) { + size_t param_len; + /* if both the prefix was given and we have a prefix + * given above, check to see if they match + */ + if (opal_cmd_line_is_taken(&cmd_line, "prefix") && + NULL != orte_cmd_line.prefix) { + /* if they don't match, then that merits a warning */ + param = strdup(opal_cmd_line_get_param(&cmd_line, "prefix", 0, 0)); + /* ensure we strip any trailing '/' */ + if (0 == strcmp(OPAL_PATH_SEP, &(param[strlen(param)-1]))) { + param[strlen(param)-1] = '\0'; + } + value = strdup(orte_cmd_line.prefix); + if (0 == strcmp(OPAL_PATH_SEP, &(value[strlen(value)-1]))) { + value[strlen(value)-1] = '\0'; + } + if (0 != strcmp(param, value)) { + orte_show_help("help-orterun.txt", "orterun:app-prefix-conflict", + true, orte_basename, value, param); + /* let the global-level prefix take precedence since we + * know that one is being used + */ + free(param); + param = strdup(orte_cmd_line.prefix); + } + free(value); + } else if (NULL != orte_cmd_line.prefix) { + param = strdup(orte_cmd_line.prefix); + } else if (opal_cmd_line_is_taken(&cmd_line, "prefix")){ + /* must be --prefix alone */ + param = strdup(opal_cmd_line_get_param(&cmd_line, "prefix", 0, 0)); + } else { + /* --enable-orterun-prefix-default was given to orterun */ + param = strdup(opal_install_dirs.prefix); + } + + if (NULL != param) { + /* "Parse" the param, aka remove superfluous path_sep. */ + param_len = strlen(param); + while (0 == strcmp (OPAL_PATH_SEP, &(param[param_len-1]))) { + param[param_len-1] = '\0'; + param_len--; + if (0 == param_len) { + orte_show_help("help-orterun.txt", "orterun:empty-prefix", + true, orte_basename, orte_basename); + free(param); + return ORTE_ERR_FATAL; + } + } + orte_set_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, ORTE_ATTR_GLOBAL, param, OPAL_STRING); + free(param); + } + } + } + + /* Did the user specify a hostfile. Need to check for both + * hostfile and machine file. 
+ * We can only deal with one hostfile per app context, otherwise give an error. + */ + if (0 < (j = opal_cmd_line_get_ninsts(&cmd_line, "hostfile"))) { + if(1 < j) { + orte_show_help("help-orterun.txt", "orterun:multiple-hostfiles", + true, orte_basename, NULL); + return ORTE_ERR_FATAL; + } else { + value = opal_cmd_line_get_param(&cmd_line, "hostfile", 0, 0); + orte_set_attribute(&app->attributes, ORTE_APP_HOSTFILE, ORTE_ATTR_LOCAL, value, OPAL_STRING); + } + } + if (0 < (j = opal_cmd_line_get_ninsts(&cmd_line, "machinefile"))) { + if(1 < j || orte_get_attribute(&app->attributes, ORTE_APP_HOSTFILE, NULL, OPAL_STRING)) { + orte_show_help("help-orterun.txt", "orterun:multiple-hostfiles", + true, orte_basename, NULL); + return ORTE_ERR_FATAL; + } else { + value = opal_cmd_line_get_param(&cmd_line, "machinefile", 0, 0); + orte_set_attribute(&app->attributes, ORTE_APP_HOSTFILE, ORTE_ATTR_LOCAL, value, OPAL_STRING); + } + } + + /* Did the user specify any hosts? */ + if (0 < (j = opal_cmd_line_get_ninsts(&cmd_line, "host"))) { + char **targ=NULL, *tval; + for (i = 0; i < j; ++i) { + value = opal_cmd_line_get_param(&cmd_line, "host", i, 0); + opal_argv_append_nosize(&targ, value); + } + tval = opal_argv_join(targ, ','); + orte_set_attribute(&app->attributes, ORTE_APP_DASH_HOST, ORTE_ATTR_LOCAL, tval, OPAL_STRING); + opal_argv_free(targ); + free(tval); + } else if (NULL != orte_default_dash_host) { + orte_set_attribute(&app->attributes, ORTE_APP_DASH_HOST, ORTE_ATTR_LOCAL, + orte_default_dash_host, OPAL_STRING); + } + + /* check for bozo error */ + if (0 > orte_cmd_line.num_procs) { + orte_show_help("help-orterun.txt", "orterun:negative-nprocs", + true, orte_basename, app->argv[0], + orte_cmd_line.num_procs, NULL); + return ORTE_ERR_FATAL; + } + + app->num_procs = (orte_std_cntr_t)orte_cmd_line.num_procs; + total_num_apps++; + + /* Capture any preload flags */ + if (orte_cmd_line.preload_binaries) { + orte_set_attribute(&app->attributes, ORTE_APP_PRELOAD_BIN, 
ORTE_ATTR_LOCAL, NULL, OPAL_BOOL); + } + /* if we were told to cwd to the session dir and the app was given in + * relative syntax, then we need to preload the binary to + * find the app - don't do this for java apps, however, as we + * can't easily find the class on the cmd line. Java apps have to + * preload their binary via the preload_files option + */ + if (!opal_path_is_absolute(app->argv[0]) && + NULL == strstr(app->argv[0], "java")) { + if (orte_cmd_line.preload_binaries) { + orte_set_attribute(&app->attributes, ORTE_APP_SSNDIR_CWD, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL); + } else if (orte_get_attribute(&app->attributes, ORTE_APP_SSNDIR_CWD, NULL, OPAL_BOOL)) { + orte_set_attribute(&app->attributes, ORTE_APP_PRELOAD_BIN, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL); + } + } + if (NULL != orte_cmd_line.preload_files) { + orte_set_attribute(&app->attributes, ORTE_APP_PRELOAD_FILES, ORTE_ATTR_LOCAL, + orte_cmd_line.preload_files, OPAL_STRING); + } + +#if OPAL_ENABLE_FT_CR == 1 + if(NULL != orte_cmd_line.sstore_load) { + orte_set_attribute(&app->attributes, ORTE_APP_SSTORE_LOAD, ORTE_ATTR_LOCAL, + orte_cmd_line.sstore_load, OPAL_STRING); + } +#endif + + /* Do not try to find argv[0] here -- the starter is responsible + for that because it may not be relevant to try to find it on + the node where orterun is executing. So just strdup() argv[0] + into app. */ + + app->app = strdup(app->argv[0]); + if (NULL == app->app) { + orte_show_help("help-orterun.txt", "orterun:call-failed", + true, orte_basename, "library", "strdup returned NULL", errno); + rc = ORTE_ERR_NOT_FOUND; + goto cleanup; + } + + /* if this is a Java application, we have a bit more work to do. Such + * applications actually need to be run under the Java virtual machine + * and the "java" command will start the "executable". 
So we need to ensure + * that all the proper java-specific paths are provided + */ + appname = opal_basename(app->app); + if (0 == strcmp(appname, "java")) { + /* see if we were given a library path */ + found = false; + for (i=1; NULL != app->argv[i]; i++) { + if (NULL != strstr(app->argv[i], "java.library.path")) { + /* yep - but does it include the path to the mpi libs? */ + found = true; + if (NULL == strstr(app->argv[i], opal_install_dirs.libdir)) { + /* doesn't appear to - add it to be safe */ + if (':' == app->argv[i][strlen(app->argv[i]-1)]) { + asprintf(&value, "-Djava.library.path=%s%s", app->argv[i], opal_install_dirs.libdir); + } else { + asprintf(&value, "-Djava.library.path=%s:%s", app->argv[i], opal_install_dirs.libdir); + } + free(app->argv[i]); + app->argv[i] = value; + } + break; + } + } + if (!found) { + /* need to add it right after the java command */ + asprintf(&value, "-Djava.library.path=%s", opal_install_dirs.libdir); + opal_argv_insert_element(&app->argv, 1, value); + free(value); + } + + /* see if we were given a class path */ + found = false; + for (i=1; NULL != app->argv[i]; i++) { + if (NULL != strstr(app->argv[i], "cp") || + NULL != strstr(app->argv[i], "classpath")) { + /* yep - but does it include the path to the mpi libs? 
*/ + found = true; + /* check if mpi.jar exists - if so, add it */ + value = opal_os_path(false, opal_install_dirs.libdir, "mpi.jar", NULL); + if (access(value, F_OK ) != -1) { + set_classpath_jar_file(app, i+1, "mpi.jar"); + } + free(value); + /* check for oshmem support */ + value = opal_os_path(false, opal_install_dirs.libdir, "shmem.jar", NULL); + if (access(value, F_OK ) != -1) { + set_classpath_jar_file(app, i+1, "shmem.jar"); + } + free(value); + /* always add the local directory */ + asprintf(&value, "%s:%s", app->cwd, app->argv[i+1]); + free(app->argv[i+1]); + app->argv[i+1] = value; + break; + } + } + if (!found) { + /* check to see if CLASSPATH is in the environment */ + found = false; // just to be pedantic + for (i=0; NULL != environ[i]; i++) { + if (0 == strncmp(environ[i], "CLASSPATH", strlen("CLASSPATH"))) { + value = strchr(environ[i], '='); + ++value; /* step over the = */ + opal_argv_insert_element(&app->argv, 1, value); + /* check for mpi.jar */ + value = opal_os_path(false, opal_install_dirs.libdir, "mpi.jar", NULL); + if (access(value, F_OK ) != -1) { + set_classpath_jar_file(app, 1, "mpi.jar"); + } + free(value); + /* check for shmem.jar */ + value = opal_os_path(false, opal_install_dirs.libdir, "shmem.jar", NULL); + if (access(value, F_OK ) != -1) { + set_classpath_jar_file(app, 1, "shmem.jar"); + } + free(value); + /* always add the local directory */ + (void)asprintf(&value, "%s:%s", app->cwd, app->argv[1]); + free(app->argv[1]); + app->argv[1] = value; + opal_argv_insert_element(&app->argv, 1, "-cp"); + found = true; + break; + } + } + if (!found) { + /* need to add it right after the java command - have + * to include the working directory and trust that + * the user set cwd if necessary + */ + char *str, *str2; + /* always start with the working directory */ + str = strdup(app->cwd); + /* check for mpi.jar */ + value = opal_os_path(false, opal_install_dirs.libdir, "mpi.jar", NULL); + if (access(value, F_OK ) != -1) { + 
(void)asprintf(&str2, "%s:%s", str, value); + free(str); + str = str2; + } + free(value); + /* check for shmem.jar */ + value = opal_os_path(false, opal_install_dirs.libdir, "shmem.jar", NULL); + if (access(value, F_OK ) != -1) { + asprintf(&str2, "%s:%s", str, value); + free(str); + str = str2; + } + free(value); + opal_argv_insert_element(&app->argv, 1, str); + free(str); + opal_argv_insert_element(&app->argv, 1, "-cp"); + } + } + /* try to find the actual command - may not be perfect */ + for (i=1; i < opal_argv_count(app->argv); i++) { + if (NULL != strstr(app->argv[i], "java.library.path")) { + continue; + } else if (NULL != strstr(app->argv[i], "cp") || + NULL != strstr(app->argv[i], "classpath")) { + /* skip the next field */ + i++; + continue; + } + /* declare this the winner */ + opal_setenv("OMPI_COMMAND", app->argv[i], true, &app->env); + /* collect everything else as the cmd line */ + if ((i+1) < opal_argv_count(app->argv)) { + value = opal_argv_join(&app->argv[i+1], ' '); + opal_setenv("OMPI_ARGV", value, true, &app->env); + free(value); + } + break; + } + } else { + /* add the cmd to the environment for MPI_Info to pickup */ + opal_setenv("OMPI_COMMAND", appname, true, &app->env); + if (1 < opal_argv_count(app->argv)) { + value = opal_argv_join(&app->argv[1], ' '); + opal_setenv("OMPI_ARGV", value, true, &app->env); + free(value); + } + } + free(appname); + + *app_ptr = app; + app = NULL; + *made_app = true; + + /* All done */ + + cleanup: + if (NULL != app) { + OBJ_RELEASE(app); + } + if (cmd_line_made) { + OBJ_DESTRUCT(&cmd_line); + } + return rc; +} + +static void set_classpath_jar_file(orte_app_context_t *app, int index, char *jarfile) +{ + if (NULL == strstr(app->argv[index], jarfile)) { + /* nope - need to add it */ + char *fmt = ':' == app->argv[index][strlen(app->argv[index]-1)] + ? 
"%s%s/%s" : "%s:%s/%s"; + char *str; + asprintf(&str, fmt, app->argv[index], opal_install_dirs.libdir, jarfile); + free(app->argv[index]); + app->argv[index] = str; + } +} + +static int parse_appfile(orte_job_t *jdata, char *filename, char ***env) +{ + size_t i, len; + FILE *fp; + char line[BUFSIZ]; + int rc, argc, app_num; + orte_app_context_t *app; + bool blank, made_app; + char bogus[] = "bogus "; + char **tmp_env; + + /* + * Make sure to clear out this variable so we don't do anything odd in + * app_create() + */ + if( NULL != orte_cmd_line.appfile ) { + free( orte_cmd_line.appfile ); + orte_cmd_line.appfile = NULL; + } + + /* Try to open the file */ + + fp = fopen(filename, "r"); + if (NULL == fp) { + orte_show_help("help-orterun.txt", "orterun:appfile-not-found", true, + filename); + return ORTE_ERR_NOT_FOUND; + } + + /* Read in line by line */ + + line[sizeof(line) - 1] = '\0'; + app_num = 0; + do { + char **argv; + + /* We need a bogus argv[0] (because when argv comes in from + the command line, argv[0] is "orterun", so the parsing + logic ignores it). So create one here rather than making + an argv and then pre-pending a new argv[0] (which would be + rather inefficient). */ + + line[0] = '\0'; + strcat(line, bogus); + + if (NULL == fgets(line + sizeof(bogus) - 1, + sizeof(line) - sizeof(bogus) - 1, fp)) { + break; + } + + /* Remove a trailing newline */ + + len = strlen(line); + if (len > 0 && '\n' == line[len - 1]) { + line[len - 1] = '\0'; + if (len > 0) { + --len; + } + } + + /* Remove comments */ + + for (i = 0; i < len; ++i) { + if ('#' == line[i]) { + line[i] = '\0'; + break; + } else if (i + 1 < len && '/' == line[i] && '/' == line[i + 1]) { + line[i] = '\0'; + break; + } + } + + /* Is this a blank line? */ + + len = strlen(line); + for (blank = true, i = sizeof(bogus); i < len; ++i) { + if (!isspace(line[i])) { + blank = false; + break; + } + } + if (blank) { + continue; + } + + /* We got a line with *something* on it. 
So process it */ + + argv = opal_argv_split(line, ' '); + argc = opal_argv_count(argv); + if (argc > 0) { + + /* Create a temporary env to use in the recursive call -- + that is: don't disturb the original env so that we can + have a consistent global env. This allows for the + case: + + orterun --mca foo bar --appfile file + + where the "file" contains multiple apps. In this case, + each app in "file" will get *only* foo=bar as the base + environment from which its specific environment is + constructed. */ + + if (NULL != *env) { + tmp_env = opal_argv_copy(*env); + if (NULL == tmp_env) { + opal_argv_free(argv); + fclose(fp); + return ORTE_ERR_OUT_OF_RESOURCE; + } + } else { + tmp_env = NULL; + } + + rc = create_app(argc, argv, jdata, &app, &made_app, &tmp_env); + if (ORTE_SUCCESS != rc) { + /* Assume that the error message has already been + printed; no need to cleanup -- we can just exit */ + exit(1); + } + if (NULL != tmp_env) { + opal_argv_free(tmp_env); + } + opal_argv_free(argv); + if (made_app) { + app->idx = app_num; + ++app_num; + opal_pointer_array_add(jdata->apps, app); + ++jdata->num_apps; + } + } + } while (!feof(fp)); + fclose(fp); + + /* All done */ + + free(filename); + return ORTE_SUCCESS; +} +/* + * Process one line from the orte_base_user_debugger MCA param and + * look for that debugger in the path. If we find it, fill in + * new_argv. 
+ */ +static int process(char *orig_line, char *basename, opal_cmd_line_t *cmd_line, + int argc, char **argv, char ***new_argv, int num_procs) +{ + int ret = ORTE_SUCCESS; + int i, j, count; + char *line = NULL, *tmp = NULL, *full_line = strdup(orig_line); + char **orterun_argv = NULL, **executable_argv = NULL, **line_argv = NULL; + char cwd[OPAL_PATH_MAX]; + bool used_num_procs = false; + bool single_app = false; + bool fail_needed_executable = false; + + line = full_line; + if (NULL == line) { + ret = ORTE_ERR_OUT_OF_RESOURCE; + goto out; + } + + /* Trim off whitespace at the beginning and ending of line */ + + for (i = 0; '\0' != line[i] && isspace(line[i]); ++line) { + continue; + } + for (i = strlen(line) - 2; i > 0 && isspace(line[i]); ++i) { + line[i] = '\0'; + } + if (strlen(line) <= 0) { + ret = ORTE_ERROR; + goto out; + } + + /* Get the tail of the command line (i.e., the user executable / + argv) */ + + opal_cmd_line_get_tail(cmd_line, &i, &executable_argv); + + /* Make a new copy of the orterun command line args, without the + orterun token itself, and without the --debug, --debugger, and + -tv flags. 
*/ + + orterun_argv = opal_argv_copy(argv); + count = opal_argv_count(orterun_argv); + opal_argv_delete(&count, &orterun_argv, 0, 1); + for (i = 0; NULL != orterun_argv[i]; ++i) { + count = opal_argv_count(orterun_argv); + if (0 == strcmp(orterun_argv[i], "-debug") || + 0 == strcmp(orterun_argv[i], "--debug")) { + opal_argv_delete(&count, &orterun_argv, i, 1); + } else if (0 == strcmp(orterun_argv[i], "-tv") || + 0 == strcmp(orterun_argv[i], "--tv")) { + opal_argv_delete(&count, &orterun_argv, i, 1); + } else if (0 == strcmp(orterun_argv[i], "--debugger") || + 0 == strcmp(orterun_argv[i], "-debugger")) { + opal_argv_delete(&count, &orterun_argv, i, 2); + } + } + + /* Replace @@ tokens - line should never realistically be bigger + than MAX_INT, so just cast to int to remove compiler warning */ + + *new_argv = NULL; + line_argv = opal_argv_split(line, ' '); + if (NULL == line_argv) { + ret = ORTE_ERR_NOT_FOUND; + goto out; + } + for (i = 0; NULL != line_argv[i]; ++i) { + if (0 == strcmp(line_argv[i], "@mpirun@") || + 0 == strcmp(line_argv[i], "@orterun@")) { + opal_argv_append_nosize(new_argv, argv[0]); + } else if (0 == strcmp(line_argv[i], "@mpirun_args@") || + 0 == strcmp(line_argv[i], "@orterun_args@")) { + for (j = 0; NULL != orterun_argv && NULL != orterun_argv[j]; ++j) { + opal_argv_append_nosize(new_argv, orterun_argv[j]); + } + } else if (0 == strcmp(line_argv[i], "@np@")) { + used_num_procs = true; + asprintf(&tmp, "%d", num_procs); + opal_argv_append_nosize(new_argv, tmp); + free(tmp); + } else if (0 == strcmp(line_argv[i], "@single_app@")) { + /* This token is only a flag; it is not replaced with any + alternate text */ + single_app = true; + } else if (0 == strcmp(line_argv[i], "@executable@")) { + /* If we found the executable, paste it in. Otherwise, + this is a possible error. 
*/ + if (NULL != executable_argv) { + opal_argv_append_nosize(new_argv, executable_argv[0]); + } else { + fail_needed_executable = true; + } + } else if (0 == strcmp(line_argv[i], "@executable_argv@")) { + /* If we found the tail, paste in the argv. Otherwise, + this is a possible error. */ + if (NULL != executable_argv) { + for (j = 1; NULL != executable_argv[j]; ++j) { + opal_argv_append_nosize(new_argv, executable_argv[j]); + } + } else { + fail_needed_executable = true; + } + } else { + /* It wasn't a special token, so just copy it over */ + opal_argv_append_nosize(new_argv, line_argv[i]); + } + } + + /* Can we find argv[0] in the path? */ + + getcwd(cwd, OPAL_PATH_MAX); + tmp = opal_path_findv((*new_argv)[0], X_OK, environ, cwd); + if (NULL != tmp) { + free(tmp); + + /* Ok, we found a good debugger. Check for some error + conditions. */ + tmp = opal_argv_join(argv, ' '); + + /* We do not support launching a debugger that requires the + -np value if the user did not specify -np on the command + line. */ + if (used_num_procs && 0 == num_procs) { + free(tmp); + tmp = opal_argv_join(orterun_argv, ' '); + orte_show_help("help-orterun.txt", "debugger requires -np", + true, (*new_argv)[0], argv[0], tmp, + (*new_argv)[0]); + /* Fall through to free / fail, below */ + } + + /* Some debuggers do not support launching MPMD */ + else if (single_app && NULL != strstr(tmp, " : ")) { + orte_show_help("help-orterun.txt", + "debugger only accepts single app", true, + (*new_argv)[0], (*new_argv)[0]); + /* Fall through to free / fail, below */ + } + + /* Some debuggers do not use orterun/mpirun, and therefore + must have an executable to run (e.g., cannot use mpirun's + app context file feature). */ + else if (fail_needed_executable) { + orte_show_help("help-orterun.txt", + "debugger requires executable", true, + (*new_argv)[0], argv[0], (*new_argv)[0], argv[0], + (*new_argv)[0]); + /* Fall through to free / fail, below */ + } + + /* Otherwise, we succeeded. Return happiness. 
*/ + else { + goto out; + } + } + + /* All done -- didn't find it */ + + opal_argv_free(*new_argv); + *new_argv = NULL; + ret = ORTE_ERR_NOT_FOUND; + + out: + if (NULL != orterun_argv) { + opal_argv_free(orterun_argv); + } + if (NULL != executable_argv) { + opal_argv_free(executable_argv); + } + if (NULL != line_argv) { + opal_argv_free(line_argv); + } + if (NULL != tmp) { + free(tmp); + } + if (NULL != full_line) { + free(full_line); + } + return ret; +} + +/** + * Run a user-level debugger + */ +static void run_debugger(char *basename, opal_cmd_line_t *cmd_line, + int argc, char *argv[], int num_procs) +{ + int i, id, ret; + char **new_argv = NULL; + const char **tmp; + char *value, **lines, *env_name; + + /* Get the orte_base_debug MCA parameter and search for a debugger + that can run */ + + id = mca_base_var_find("orte", "orte", NULL, "base_user_debugger"); + if (id < 0) { + orte_show_help("help-orterun.txt", "debugger-mca-param-not-found", + true); + exit(1); + } + + ret = mca_base_var_get_value (id, &tmp, NULL, NULL); + if (OPAL_SUCCESS != ret || NULL == tmp || NULL == tmp[0]) { + orte_show_help("help-orterun.txt", "debugger-orte_base_user_debugger-empty", + true); + exit(1); + } + + /* Look through all the values in the MCA param */ + + lines = opal_argv_split(tmp[0], ':'); + for (i = 0; NULL != lines[i]; ++i) { + if (ORTE_SUCCESS == process(lines[i], basename, cmd_line, argc, argv, + &new_argv, num_procs)) { + break; + } + } + + /* If we didn't find one, abort */ + + if (NULL == lines[i]) { + orte_show_help("help-orterun.txt", "debugger-not-found", true); + exit(1); + } + opal_argv_free(lines); + + /* We found one */ + + /* cleanup the MPIR arrays in case the debugger doesn't set them */ + memset((char*)MPIR_executable_path, 0, MPIR_MAX_PATH_LENGTH); + memset((char*)MPIR_server_arguments, 0, MPIR_MAX_ARG_LENGTH); + + /* Set an MCA param so that everyone knows that they are being + launched under a debugger; not all debuggers are consistent + about setting 
MPIR_being_debugged in both the launcher and the + MPI processes */ + ret = mca_base_var_env_name ("orte_in_parallel_debugger", &env_name); + if (OPAL_SUCCESS == ret && NULL != env_name) { + opal_setenv(env_name, "1", true, &environ); + free(env_name); + } + + /* Launch the debugger */ + execvp(new_argv[0], new_argv); + value = opal_argv_join(new_argv, ' '); + orte_show_help("help-orterun.txt", "debugger-exec-failed", + true, basename, value, new_argv[0]); + free(value); + opal_argv_free(new_argv); + exit(1); +} + +/**** DEBUGGER CODE ****/ +/* + * Debugger support for orterun + * + * We interpret the MPICH debugger interface as follows: + * + * a) The launcher + * - spawns the other processes, + * - fills in the table MPIR_proctable, and sets MPIR_proctable_size + * - sets MPIR_debug_state to MPIR_DEBUG_SPAWNED ( = 1) + * - calls MPIR_Breakpoint() which the debugger will have a + * breakpoint on. + * + * b) Applications start and then spin until MPIR_debug_gate is set + * non-zero by the debugger. + * + * This file implements (a). + * + ************************************************************************** + * + * Note that we have presently tested both TotalView and DDT parallel + * debuggers. They both nominally subscribe to the Etnus attaching + * interface, but there are differences between the two. + * + * TotalView: user launches "totalview mpirun -a ......". + * TV launches mpirun. mpirun launches the application and then calls + * MPIR_Breakpoint(). This is the signal to TV that it's a parallel + * MPI job. TV then reads the proctable in mpirun and attaches itself + * to all the processes (it takes care of launching itself on the + * remote nodes). Upon attaching to all the MPI processes, the + * variable MPIR_being_debugged is set to 1. When it has finished + * attaching itself to all the MPI processes that it wants to, + * MPIR_Breakpoint() returns. + * + * DDT: user launches "ddt bin -np X ". 
DDT fork/exec's + * mpirun to launch ddt-debugger on the back-end nodes via "mpirun -np + * X ddt-debugger" (not the lack of other arguments -- we can't pass + * anything to mpirun). This app will eventually fork/exec the MPI + * app. DDT does not current set MPIR_being_debugged in the MPI app. + * + ************************************************************************** + * + * We support two ways of waiting for attaching debuggers. The + * implementation spans this file and ompi/debuggers/ompi_debuggers.c. + * + * 1. If using orterun: MPI processes will have the + * orte_in_parallel_debugger MCA param set to true (because not all + * debuggers consistently set MPIR_being_debugged in both the launcher + * and in the MPI procs). The HNP will call MPIR_Breakpoint() and + * then RML send a message to VPID 0 (MCW rank 0) when it returns + * (MPIR_Breakpoint() doesn't return until the debugger has attached + * to all relevant processes). Meanwhile, VPID 0 blocks waiting for + * the RML message. All other VPIDs immediately call the grpcomm + * barrier (and therefore block until the debugger attaches). Once + * VPID 0 receives the RML message, we know that the debugger has + * attached to all processes that it cares about, and VPID 0 then + * joins the grpcomm barrier, allowing the job to continue. This + * scheme has the side effect of nicely supporting partial attaches by + * parallel debuggers (i.e., attaching to only some of the MPI + * processes; not necessarily all of them). + * + * 2. If not using orterun: in this case, we know that there will not be an RML message + * sent to VPID 0. So we have to look for a magic environment + * variable from the launcher to know if the jobs will be attached by + * a debugger (e.g., set by yod, srun, ...etc.), and if so, spin on + * MPIR_debug_gate. These environment variable names must be + * hard-coded in the OMPI layer (see ompi/debuggers/ompi_debuggers.c). 
+ */ + +/* local globals and functions */ +#define DUMP_INT(X) fprintf(stderr, " %s = %d\n", # X, X); +#define FILE_MODE (S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH) + +struct MPIR_PROCDESC { + char *host_name; /* something that can be passed to inet_addr */ + char *executable_name; /* name of binary */ + int pid; /* process pid */ +}; + + +static void orte_debugger_dump(void) +{ + int i; + + DUMP_INT(MPIR_being_debugged); + DUMP_INT(MPIR_debug_state); + DUMP_INT(MPIR_partial_attach_ok); + DUMP_INT(MPIR_i_am_starter); + DUMP_INT(MPIR_forward_output); + DUMP_INT(MPIR_proctable_size); + fprintf(stderr, " MPIR_proctable:\n"); + for (i = 0; i < MPIR_proctable_size; i++) { + fprintf(stderr, + " (i, host, exe, pid) = (%d, %s, %s, %d)\n", + i, + MPIR_proctable[i].host_name, + MPIR_proctable[i].executable_name, + MPIR_proctable[i].pid); + } + fprintf(stderr, "MPIR_executable_path: %s\n", + ('\0' == MPIR_executable_path[0]) ? + "NULL" : (char*) MPIR_executable_path); + fprintf(stderr, "MPIR_server_arguments: %s\n", + ('\0' == MPIR_server_arguments[0]) ? + "NULL" : (char*) MPIR_server_arguments); +} + +/** + * Initialization of data structures for running under a debugger + * using the MPICH/TotalView parallel debugger interface. Before the + * spawn we need to check if we are being run under a TotalView-like + * debugger; if so then inform applications via an MCA parameter. 
+ */ +static void orte_debugger_init_before_spawn(orte_job_t *jdata) +{ + char *env_name; + orte_app_context_t *app; + int i; + char *attach_fifo; + + if (!MPIR_being_debugged && !orte_in_parallel_debugger) { + /* if we were given a test debugger, then we still want to + * colaunch it + */ + if (NULL != orte_debugger_test_daemon) { + opal_output_verbose(2, orte_debug_output, + "%s No debugger test daemon specified", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); + goto launchit; + } + /* if we were given an auto-detect rate, then we want to setup + * an event so we periodically do the check + */ + if (0 < orte_debugger_check_rate) { + opal_output_verbose(2, orte_debug_output, + "%s Setting debugger attach check rate for %d seconds", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + orte_debugger_check_rate); + ORTE_TIMER_EVENT(orte_debugger_check_rate, 0, attach_debugger, ORTE_SYS_PRI); + } else if (orte_create_session_dirs) { + /* create the attachment FIFO and setup readevent - cannot be + * done if no session dirs exist! 
+ */ + attach_fifo = opal_os_path(false, orte_process_info.job_session_dir, "debugger_attach_fifo", NULL); + if ((mkfifo(attach_fifo, FILE_MODE) < 0) && errno != EEXIST) { + opal_output(0, "CANNOT CREATE FIFO %s: errno %d", attach_fifo, errno); + free(attach_fifo); + return; + } + strncpy(MPIR_attach_fifo, attach_fifo, MPIR_MAX_PATH_LENGTH - 1); + free(attach_fifo); + open_fifo(); + } + return; + } + + launchit: + opal_output_verbose(1, orte_debug_output, "Info: Spawned by a debugger"); + + /* tell the procs they are being debugged */ + (void) mca_base_var_env_name ("orte_in_parallel_debugger", &env_name); + + for (i=0; i < jdata->apps->size; i++) { + if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) { + continue; + } + opal_setenv(env_name, "1", true, &app->env); + } + free(env_name); +} + +static void setup_debugger_job(void) +{ + orte_job_t *debugger; + orte_app_context_t *app; + orte_proc_t *proc; + int i, rc; + orte_node_t *node; + orte_vpid_t vpid=0; + char cwd[OPAL_PATH_MAX]; + + /* setup debugger daemon job */ + debugger = OBJ_NEW(orte_job_t); + /* create a jobid for these daemons - this is done solely + * to avoid confusing the rest of the system's bookkeeping + */ + orte_plm_base_create_jobid(debugger); + /* set the personality to ORTE */ + opal_argv_append_nosize(&debugger->personality, "orte"); + /* flag the job as being debugger daemons */ + ORTE_FLAG_SET(debugger, ORTE_JOB_FLAG_DEBUGGER_DAEMON); + /* unless directed, we do not forward output */ + if (!MPIR_forward_output) { + ORTE_FLAG_SET(debugger, ORTE_JOB_FLAG_FORWARD_OUTPUT); + } + /* dont push stdin */ + debugger->stdin_target = ORTE_VPID_INVALID; + /* add it to the global job pool */ + opal_hash_table_set_value_uint32(orte_job_data, debugger->jobid, debugger); + /* create an app_context for the debugger daemon */ + app = OBJ_NEW(orte_app_context_t); + if (NULL != orte_debugger_test_daemon) { + app->app = strdup(orte_debugger_test_daemon); + } else { + app->app 
= strdup((char*)MPIR_executable_path); + } + /* don't currently have an option to pass the debugger + * cwd - probably should add one someday + */ + if (OPAL_SUCCESS != (rc = opal_getcwd(cwd, sizeof(cwd)))) { + orte_show_help("help-orterun.txt", "orterun:init-failure", + true, "get the cwd", rc); + return; + } + app->cwd = strdup(cwd); + orte_remove_attribute(&app->attributes, ORTE_APP_USER_CWD); + opal_argv_append_nosize(&app->argv, app->app); + build_debugger_args(app); + opal_pointer_array_add(debugger->apps, app); + debugger->num_apps = 1; + /* create a job map */ + debugger->map = OBJ_NEW(orte_job_map_t); + /* in building the map, we want to launch one debugger daemon + * on each node that *already has an application process on it*. + * We cannot just launch one debugger daemon on EVERY node because + * the original job may not have placed procs on every node. So + * we construct the map here by cycling across all nodes, adding + * only those nodes where num_procs > 0. + */ + for (i=0; i < orte_node_pool->size; i++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) { + continue; + } + /* if this node wasn't included in the vm, ignore it */ + if (NULL == node->daemon) { + continue; + } + /* if the node doesn't have any app procs on it, ignore it */ + if (node->num_procs < 1) { + continue; + } + /* this node has at least one proc, so add it to our map */ + OBJ_RETAIN(node); + opal_pointer_array_add(debugger->map->nodes, node); + debugger->map->num_nodes++; + /* add a debugger daemon to the node - note that the + * debugger daemon does NOT count against our subscribed slots + */ + proc = OBJ_NEW(orte_proc_t); + proc->name.jobid = debugger->jobid; + proc->name.vpid = vpid++; + /* set the local/node ranks - we don't actually care + * what these are, but the odls needs them + */ + proc->local_rank = 0; + proc->node_rank = 0; + proc->app_rank = proc->name.vpid; + /* flag the proc as ready for launch */ + proc->state = 
ORTE_PROC_STATE_INIT; + proc->app_idx = 0; + + OBJ_RETAIN(node); /* maintain accounting on object */ + proc->node = node; + /* add the proc to the job */ + opal_pointer_array_set_item(debugger->procs, proc->name.vpid, proc); + debugger->num_procs++; + + /* add the proc to the node's array */ + OBJ_RETAIN(proc); + opal_pointer_array_add(node->procs, (void*)proc); + node->num_procs++; + } + /* schedule it for launch */ + debugger->state = ORTE_JOB_STATE_INIT; + ORTE_ACTIVATE_JOB_STATE(debugger, ORTE_JOB_STATE_LAUNCH_APPS); +} + +static bool mpir_breakpoint_fired = false; + +/* + * Initialization of data structures for running under a debugger + * using the MPICH/TotalView parallel debugger interface. This stage + * of initialization must occur after spawn + * + * NOTE: We -always- perform this step to ensure that any debugger + * that attaches to us post-launch of the application can get a + * completed proctable + */ +void orte_debugger_init_after_spawn(int fd, short event, void *cbdata) +{ + orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; + orte_job_t *jdata = caddy->jdata; + orte_proc_t *proc; + orte_app_context_t *appctx; + orte_vpid_t i, j; + opal_buffer_t *buf; + int rc; + char **aliases, *aptr; + + /* if we couldn't get thru the mapper stage, we might + * enter here with no procs. 
Avoid the "zero byte malloc" + * message by checking here + */ + if (MPIR_proctable || 0 == jdata->num_procs) { + /* already initialized */ + opal_output_verbose(5, orte_debug_output, + "%s: debugger already initialized or zero procs", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); + OBJ_RELEASE(caddy); + if (!mpir_breakpoint_fired) { + /* record that we have triggered the debugger */ + mpir_breakpoint_fired = true; + + /* trigger the debugger */ + MPIR_Breakpoint(); + + /* send a message to rank=0 to release it */ + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, 0)) || + ORTE_PROC_STATE_UNTERMINATED < proc->state ) { + /* proc is already dead */ + return; + } + buf = OBJ_NEW(opal_buffer_t); /* don't need anything in this */ + if (0 > (rc = orte_rml.send_buffer_nb(&proc->name, buf, + ORTE_RML_TAG_DEBUGGER_RELEASE, + orte_rml_send_callback, NULL))) { + opal_output(0, "Error: could not send debugger release to MPI procs - error %s", ORTE_ERROR_NAME(rc)); + OBJ_RELEASE(buf); + } + } + return; + } + + /* fill in the proc table for the application processes */ + + opal_output_verbose(5, orte_debug_output, + "%s: Setting up debugger process table for applications", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); + + MPIR_debug_state = 1; + + /* set the total number of processes in the job */ + MPIR_proctable_size = jdata->num_procs; + + /* allocate MPIR_proctable */ + MPIR_proctable = (struct MPIR_PROCDESC *)malloc(sizeof(struct MPIR_PROCDESC) * + MPIR_proctable_size); + if (MPIR_proctable == NULL) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + OBJ_RELEASE(caddy); + return; + } + + if (orte_debugger_dump_proctable) { + opal_output(orte_clean_output, "MPIR Proctable for job %s", ORTE_JOBID_PRINT(jdata->jobid)); + } + + /* initialize MPIR_proctable */ + for (j=0; j < jdata->num_procs; j++) { + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) { + continue; + } + /* store this data in the location whose index + * corresponds to 
the proc's rank + */ + i = proc->name.vpid; + if (NULL == (appctx = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, proc->app_idx))) { + continue; + } + + /* take the indicated alias as the hostname, if aliases exist */ + if (orte_retain_aliases) { + aliases = NULL; + aptr = NULL; + if (orte_get_attribute(&proc->node->attributes, ORTE_NODE_ALIAS, (void**)&aptr, OPAL_STRING)) { + aliases = opal_argv_split(aptr, ','); + free(aptr); + if (orte_use_hostname_alias <= opal_argv_count(aliases)) { + MPIR_proctable[i].host_name = strdup(aliases[orte_use_hostname_alias-1]); + } + opal_argv_free(aliases); + } + } else { + /* just use the default name */ + MPIR_proctable[i].host_name = strdup(proc->node->name); + } + + if ( 0 == strncmp(appctx->app, OPAL_PATH_SEP, 1 )) { + MPIR_proctable[i].executable_name = + opal_os_path( false, appctx->app, NULL ); + } else { + MPIR_proctable[i].executable_name = + opal_os_path( false, appctx->cwd, appctx->app, NULL ); + } + MPIR_proctable[i].pid = proc->pid; + if (orte_debugger_dump_proctable) { + opal_output(orte_clean_output, "%s: Host %s Exe %s Pid %d", + ORTE_VPID_PRINT(i), MPIR_proctable[i].host_name, + MPIR_proctable[i].executable_name, MPIR_proctable[i].pid); + } + } + + if (0 < opal_output_get_verbosity(orte_debug_output)) { + orte_debugger_dump(); + } + + /* if we are being launched under a debugger, then we must wait + * for it to be ready to go and do some things to start the job + */ + if (MPIR_being_debugged || NULL != orte_debugger_test_daemon || + NULL != getenv("ORTE_TEST_DEBUGGER_ATTACH")) { + /* if we are not launching debugger daemons, then trigger + * the debugger - otherwise, we need to wait for the debugger + * daemons to be started + */ + if ('\0' == MPIR_executable_path[0] && NULL == orte_debugger_test_daemon) { + /* record that we have triggered the debugger */ + mpir_breakpoint_fired = true; + + /* trigger the debugger */ + MPIR_Breakpoint(); + + /* send a message to rank=0 to release it */ + if 
(NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, 0)) || + ORTE_PROC_STATE_UNTERMINATED < proc->state) { + /* proc is already dead or never registered with us (so we don't have + * contact info for him) + */ + return; + } + opal_output_verbose(2, orte_debug_output, + "%s sending debugger release to %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&proc->name)); + buf = OBJ_NEW(opal_buffer_t); /* don't need anything in this */ + if (0 > (rc = orte_rml.send_buffer_nb(&proc->name, buf, + ORTE_RML_TAG_DEBUGGER_RELEASE, + orte_rml_send_callback, NULL))) { + opal_output(0, "Error: could not send debugger release to MPI procs - error %s", ORTE_ERROR_NAME(rc)); + OBJ_RELEASE(buf); + } + } else { + /* if I am launching debugger daemons, then I need to do so now + * that the job has been started and I know which nodes have + * apps on them + */ + opal_output_verbose(2, orte_debug_output, + "%s Cospawning debugger daemons %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (NULL == orte_debugger_test_daemon) ? 
+ MPIR_executable_path : orte_debugger_test_daemon); + setup_debugger_job(); + } + /* we don't have anything else to do */ + OBJ_RELEASE(caddy); + return; + } + + /* if we are not being debugged, then just cleanup and depart */ + OBJ_RELEASE(caddy); +} + +static void orte_debugger_detached(int fd, short event, void *cbdata) +{ + orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; + OBJ_RELEASE(caddy); + + /* need to ensure MPIR_Breakpoint is called again if another debugger attaches */ + mpir_breakpoint_fired = false; +} + +static void open_fifo (void) +{ + if (attach_fd > 0) { + close(attach_fd); + } + + attach_fd = open(MPIR_attach_fifo, O_RDONLY | O_NONBLOCK, 0); + if (attach_fd < 0) { + opal_output(0, "%s unable to open debugger attach fifo", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); + return; + } + + /* Set this fd to be close-on-exec so that children don't see it */ + if (opal_fd_set_cloexec(attach_fd) != OPAL_SUCCESS) { + opal_output(0, "%s unable to set debugger attach fifo to CLOEXEC", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); + close(attach_fd); + attach_fd = -1; + return; + } + + opal_output_verbose(2, orte_debug_output, + "%s Monitoring debugger attach fifo %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + MPIR_attach_fifo); + attach = (opal_event_t*)malloc(sizeof(opal_event_t)); + opal_event_set(orte_event_base, attach, attach_fd, OPAL_EV_READ, attach_debugger, attach); + + fifo_active = true; + opal_event_add(attach, 0); +} + +static void attach_debugger(int fd, short event, void *arg) +{ + unsigned char fifo_cmd; + int rc; + orte_timer_t *tm; + opal_event_t *attach; + + if (fifo_active) { + attach = (opal_event_t*)arg; + fifo_active = false; + + rc = read(attach_fd, &fifo_cmd, sizeof(fifo_cmd)); + if (!rc) { + /* release the current event */ + opal_event_free(attach); + /* reopen device to clear hangup */ + open_fifo(); + return; + } + if (1 != fifo_cmd) { + /* ignore the cmd */ + fifo_active = true; + opal_event_add(attach, 0); + return; + } + } + + if 
(!MPIR_being_debugged && !orte_debugger_test_attach) { + /* false alarm - reset the read or timer event */ + if (0 == orte_debugger_check_rate) { + fifo_active = true; + opal_event_add(attach, 0); + } else if (!MPIR_being_debugged) { + tm = (orte_timer_t*)arg; + /* re-add the event */ + opal_event_evtimer_add(tm->ev, &tm->tv); + } + return; + } + + opal_output_verbose(1, orte_debug_output, + "%s Attaching debugger %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (NULL == orte_debugger_test_daemon) ? MPIR_executable_path : orte_debugger_test_daemon); + + /* a debugger has attached! All the MPIR_Proctable + * data is already available, so we only need to + * check to see if we should spawn any daemons + */ + if ('\0' != MPIR_executable_path[0] || NULL != orte_debugger_test_daemon) { + opal_output_verbose(2, orte_debug_output, + "%s Spawning debugger daemons %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (NULL == orte_debugger_test_daemon) ? + MPIR_executable_path : orte_debugger_test_daemon); + setup_debugger_job(); + } + + /* reset the read or timer event */ + if (0 == orte_debugger_check_rate) { + fifo_active = true; + opal_event_add(attach, 0); + } else if (!MPIR_being_debugged) { + tm = (orte_timer_t*)arg; + /* re-add the event */ + opal_event_evtimer_add(tm->ev, &tm->tv); + } +} + +static void build_debugger_args(orte_app_context_t *debugger) +{ + int i, j; + char mpir_arg[MPIR_MAX_ARG_LENGTH]; + + if ('\0' != MPIR_server_arguments[0]) { + j=0; + memset(mpir_arg, 0, MPIR_MAX_ARG_LENGTH); + for (i=0; i < MPIR_MAX_ARG_LENGTH; i++) { + if (MPIR_server_arguments[i] == '\0') { + if (0 < j) { + opal_argv_append_nosize(&debugger->argv, mpir_arg); + memset(mpir_arg, 0, MPIR_MAX_ARG_LENGTH); + j=0; + } + } else { + mpir_arg[j] = MPIR_server_arguments[i]; + j++; + } + } + } +} + +void orte_timeout_wakeup(int sd, short args, void *cbdata) +{ + char *tm; + + /* this function gets called when the job execution time + * has hit a prescribed limit - so just abort + */ + tm = 
getenv("MPIEXEC_TIMEOUT"); + orte_show_help("help-orterun.txt", "orterun:timeout", + true, (NULL == tm) ? "NULL" : tm); + ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); + /* if we are testing HNP suicide, then just exit */ + if (NULL != getenv("ORTE_TEST_HNP_SUICIDE")) { + opal_output(0, "HNP exiting w/o cleanup"); + exit(1); + } + /* abort the job */ + ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALL_JOBS_COMPLETE); + /* set the global abnormal exit flag */ + orte_abnormal_term_ordered = true; +} diff --git a/orte/util/Makefile.am b/orte/util/Makefile.am index 6873f08328c..e0af75ecc86 100644 --- a/orte/util/Makefile.am +++ b/orte/util/Makefile.am @@ -11,7 +11,7 @@ # All rights reserved. # Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved. # Copyright (c) 2014 Cisco Systems, Inc. All rights reserved. -# Copyright (c) 2014-2016 Intel, Inc. All rights reserved. +# Copyright (c) 2014-2015 Intel, Inc. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -57,8 +57,7 @@ headers += \ util/nidmap.h \ util/regex.h \ util/attr.h \ - util/listener.h \ - util/cmd_line.h + util/listener.h lib@ORTE_LIB_PREFIX@open_rte_la_SOURCES += \ util/error_strings.c \ @@ -77,8 +76,7 @@ lib@ORTE_LIB_PREFIX@open_rte_la_SOURCES += \ util/nidmap.c \ util/regex.c \ util/attr.c \ - util/listener.c \ - util/cmd_line.c + util/listener.c # Remove the generated man pages distclean-local: diff --git a/orte/util/cmd_line.c b/orte/util/cmd_line.c deleted file mode 100644 index cf5d755140c..00000000000 --- a/orte/util/cmd_line.c +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Copyright (c) 2016 Intel, Inc. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -#include "orte_config.h" -#include "orte/types.h" -#include "orte/constants.h" - -#include -#include - -#include "opal/mca/base/base.h" -#include "opal/util/cmd_line.h" -#include "opal/util/printf.h" -#include "opal/runtime/opal.h" - -#include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/schizo/base/base.h" - -#include "orte/util/cmd_line.h" - -int orte_cmd_line_create(opal_cmd_line_t *cmd_line, - int argc, char **argv, - char ***context_env, char ***global_env, - bool *version, bool *help) -{ - int i, rc; - - if (NULL != version) { - *version = false; - } - if (NULL != help) { - *help = false; - } - - if (NULL != version) { - /* see if they asked for us to print version */ - for (i=0; NULL != argv[i]; i++) { - if (0 == strcmp(argv[i], "--version") || - 0 == strcmp(argv[i], "-V")) { - *version = true; - return ORTE_SUCCESS; - } - } - } - - /* process any mca params */ - if (OPAL_SUCCESS != (rc = mca_base_cmd_line_process_args(argv, context_env, global_env))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - - opal_cmd_line_create(cmd_line, NULL); - - /* init the MCA system - will just refcount if already initialized */ - opal_init_util(NULL, NULL); - - /* open the SCHIZO framework so we can define the cmd line options */ - if (ORTE_SUCCESS != (rc = mca_base_framework_open(&orte_schizo_base_framework, 0))) { - ORTE_ERROR_LOG(rc); - return rc; - } - if (ORTE_SUCCESS != (rc = orte_schizo_base_select())) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* define the cli options */ - if (ORTE_SUCCESS != (rc = orte_schizo.define_cli(cmd_line))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* close the framework for bookkeeping purposes */ - mca_base_framework_close(&orte_schizo_base_framework); - - /* decrement the opal refcount */ - opal_finalize_util(); - - /* now that options have been defined, finish setup */ - mca_base_cmd_line_setup(cmd_line); - - - /* Check for help request - must do this after 
we setup - * the cmd line so the help messages can display */ - if (NULL != help) { - for (i=0; NULL != argv[i]; i++) { - if (0 == strcmp(argv[i], "--help") || - 0 == strcmp(argv[i], "-h")) { - *help = true; - return ORTE_SUCCESS; - } - } - } - - /* parse the result to get values */ - if (OPAL_SUCCESS != (rc = opal_cmd_line_parse(cmd_line, true, - argc, argv)) ) { - if (OPAL_ERR_SILENT != rc) { - fprintf(stderr, "%s: command line error (%s)\n", argv[0], - opal_strerror(rc)); - } - return rc; - } - - return ORTE_SUCCESS; -} diff --git a/orte/util/cmd_line.h b/orte/util/cmd_line.h deleted file mode 100644 index 231fa1d96c9..00000000000 --- a/orte/util/cmd_line.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright (c) 2016 Intel, Inc. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -/** @file: - * - * Populates global structure with system-specific information. - * - * Notes: add limits.h, compute size of integer and other types via sizeof(type)*CHAR_BIT - * - */ - -#ifndef _ORTE_CMD_LINE_H_ -#define _ORTE_CMD_LINE_H_ - -#include "orte_config.h" - -#ifdef HAVE_STDINT_h -#include -#endif - -#include "orte/types.h" - -#include "opal/util/cmd_line.h" - -BEGIN_C_DECLS - -ORTE_DECLSPEC int orte_cmd_line_create(opal_cmd_line_t *cmd_line, - int argc, char **argv, - char ***context_env, char ***global_env, - bool *version, bool *help); - -END_C_DECLS -#endif