diff --git a/.gitignore b/.gitignore index 36908c03f07..1228a7948ed 100644 --- a/.gitignore +++ b/.gitignore @@ -415,6 +415,7 @@ orte/test/mpi/memcached-dummy orte/test/mpi/coll_test orte/test/mpi/badcoll orte/test/mpi/iof +orte/test/mpi/no-disconnect orte/test/system/radix orte/test/system/sigusr_trap diff --git a/opal/mca/pmix/base/pmix_base_fns.c b/opal/mca/pmix/base/pmix_base_fns.c index bee99bd8062..cb9e4ccf43f 100644 --- a/opal/mca/pmix/base/pmix_base_fns.c +++ b/opal/mca/pmix/base/pmix_base_fns.c @@ -2,7 +2,7 @@ /* * Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2014-2015 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. * Copyright (c) 2014-2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2016 Mellanox Technologies, Inc. @@ -118,6 +118,12 @@ static void lookup_cbfunc(int status, opal_list_t *data, void *cbdata) cd->active = false; } +static void opcbfunc(int status, void *cbdata) +{ + struct lookup_caddy_t *cd = (struct lookup_caddy_t*)cbdata; + cd->active = false; +} + int opal_pmix_base_exchange(opal_value_t *indat, opal_pmix_pdata_t *outdat, int timeout) @@ -141,11 +147,29 @@ int opal_pmix_base_exchange(opal_value_t *indat, opal_list_append(&ilist, &info->super); /* publish it with "session" scope */ - rc = opal_pmix.publish(&ilist); - OPAL_LIST_DESTRUCT(&ilist); - if (OPAL_SUCCESS != rc) { - OPAL_ERROR_LOG(rc); - return rc; + if (NULL == opal_pmix.publish_nb) { + rc = opal_pmix.publish(&ilist); + OPAL_LIST_DESTRUCT(&ilist); + if (OPAL_SUCCESS != rc) { + OPAL_ERROR_LOG(rc); + return rc; + } + } else { + caddy.active = true; + rc = opal_pmix.publish_nb(&ilist, opcbfunc, &caddy); + if (OPAL_SUCCESS != rc) { + OPAL_ERROR_LOG(rc); + OPAL_LIST_DESTRUCT(&ilist); + return rc; + } + while (caddy.active) { + usleep(10); + } + OPAL_LIST_DESTRUCT(&ilist); + if (OPAL_SUCCESS != caddy.status) { + 
OPAL_ERROR_LOG(caddy.status); + return caddy.status; + } } /* lookup the other side's info - if a non-blocking form diff --git a/orte/mca/odls/base/odls_base_default_fns.c b/orte/mca/odls/base/odls_base_default_fns.c index 30462ac4faa..8ce47c18e3b 100644 --- a/orte/mca/odls/base/odls_base_default_fns.c +++ b/orte/mca/odls/base/odls_base_default_fns.c @@ -131,7 +131,7 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *buffer, /* if we couldn't provide the allocation regex on the orted * cmd line, then we need to provide all the info here */ if (!orte_nidmap_communicated) { - if (ORTE_SUCCESS != (rc = orte_util_nidmap_create(&nidmap))) { + if (ORTE_SUCCESS != (rc = orte_util_nidmap_create(orte_node_pool, &nidmap))) { ORTE_ERROR_LOG(rc); return rc; } @@ -246,6 +246,22 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *buffer, return rc; } + if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) { + /* compute and pack the ppn regex */ + if (ORTE_SUCCESS != (rc = orte_util_nidmap_generate_ppn(jdata, &nidmap))) { + ORTE_ERROR_LOG(rc); + return rc; + } + if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &nidmap, 1, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + free(nidmap); + return rc; + } + free(nidmap); + } + + /* compute and pack the regex of ppn */ + return ORTE_SUCCESS; } @@ -262,13 +278,12 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer, int rc; orte_std_cntr_t cnt; orte_job_t *jdata=NULL, *daemons; - int32_t n, k, m; + int32_t n, k; opal_buffer_t *bptr; - orte_node_t *node; orte_proc_t *pptr, *dmn; orte_app_context_t *app; - bool newmap = false; int8_t flag; + char *ppn; OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output, "%s odls:constructing child list", @@ -356,7 +371,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer, * the storage */ jdata->jobid = ORTE_JOBID_INVALID; OBJ_RELEASE(jdata); - /* get the correct job object */ + /* get the correct 
job object - it will be completely filled out */ if (NULL == (jdata = orte_get_job_data_object(*job))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); rc = ORTE_ERR_NOT_FOUND; @@ -364,25 +379,65 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer, } } else { opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, jdata); - } - /* ensure the map object is present */ - if (NULL == jdata->map) { - jdata->map = OBJ_NEW(orte_job_map_t); - newmap = true; + /* ensure the map object is present */ + if (NULL == jdata->map) { + jdata->map = OBJ_NEW(orte_job_map_t); + } } - if (orte_no_vm) { - /* if we are operating novm, then mpirun will have sent us - * the complete array of procs - process it */ - for (n=0; n < jdata->procs->size; n++) { - if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, n))) { - continue; + /* if the job is fully described, then mpirun will have computed + * and sent us the complete array of procs in the orte_job_t, so we + * don't need to do anything more here */ + if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) { + if (!ORTE_PROC_IS_HNP) { + /* extract the ppn regex */ + cnt = 1; + if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &ppn, &cnt, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + goto REPORT_ERROR; } - if (ORTE_PROC_STATE_UNDEF == pptr->state) { - /* not ready for use yet */ - continue; + /* populate the node array of the job map and the proc array of + * the job object so we know how many procs are on each node */ + if (ORTE_SUCCESS != (rc = orte_util_nidmap_parse_ppn(jdata, ppn))) { + ORTE_ERROR_LOG(rc); + free(ppn); + goto REPORT_ERROR; + } + free(ppn); + /* now assign locations to the procs */ + if (ORTE_SUCCESS != (rc = orte_rmaps_base_assign_locations(jdata))) { + ORTE_ERROR_LOG(rc); + goto REPORT_ERROR; } + } + /* compute the ranks and add the proc objects + * to the jdata->procs array */ + if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) { + 
ORTE_ERROR_LOG(rc); + goto REPORT_ERROR; + } + /* and finally, compute the local and node ranks */ + if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) { + ORTE_ERROR_LOG(rc); + goto REPORT_ERROR; + } + } + + /* now that the node array in the job map and jdata are completely filled out, + * we need to "wireup" the procs to their nodes so other utilities can + * locate them */ + for (n=0; n < jdata->procs->size; n++) { + if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, n))) { + continue; + } + if (ORTE_PROC_STATE_UNDEF == pptr->state) { + /* not ready for use yet */ + continue; + } + if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) { + /* the parser will have already made the connection, but the fully described + * case won't have done it, so connect the proc to its node here */ opal_output_verbose(5, orte_odls_base_framework.framework_output, "%s GETTING DAEMON FOR PROC %s WITH PARENT %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), @@ -401,86 +456,37 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer, } OBJ_RETAIN(dmn->node); pptr->node = dmn->node; - /* add proc to node - note that num_procs for the - * node was already correctly unpacked, so don't - * increment it here */ - OBJ_RETAIN(pptr); - opal_pointer_array_add(dmn->node->procs, pptr); - - /* add the node to the map, if not already there */ - if (!ORTE_FLAG_TEST(dmn->node, ORTE_NODE_FLAG_MAPPED)) { - OBJ_RETAIN(dmn->node); - ORTE_FLAG_SET(dmn->node, ORTE_NODE_FLAG_MAPPED); - opal_pointer_array_add(jdata->map->nodes, dmn->node); - if (newmap) { - jdata->map->num_nodes++; - } - } - - /* see if it belongs to us */ - if (pptr->parent == ORTE_PROC_MY_NAME->vpid) { - /* is this child on our current list of children */ - if (!ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_LOCAL)) { - /* not on the local list */ - OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output, - "%s[%s:%d] adding proc %s to my local list", 
- ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - __FILE__, __LINE__, - ORTE_NAME_PRINT(&pptr->name))); - /* keep tabs of the number of local procs */ - jdata->num_local_procs++; - /* add this proc to our child list */ - OBJ_RETAIN(pptr); - ORTE_FLAG_SET(pptr, ORTE_PROC_FLAG_LOCAL); - opal_pointer_array_add(orte_local_children, pptr); - } - - /* if the job is in restart mode, the child must not barrier when launched */ - if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) { - orte_set_attribute(&pptr->attributes, ORTE_PROC_NOBARRIER, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL); - } - /* mark that this app_context is being used on this node */ - app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pptr->app_idx); - ORTE_FLAG_SET(app, ORTE_APP_FLAG_USED_ON_NODE); - } - } - } else { - /* create the map - will already have been done for the novm case */ - if (ORTE_SUCCESS != (rc = orte_rmaps_base_map_job(jdata))) { - ORTE_ERROR_LOG(rc); - goto REPORT_ERROR; } - /* find our local procs */ - for (n=0; n < jdata->map->nodes->size; n++) { - if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, n))) { - continue; - } - if (node->index != (int)ORTE_PROC_MY_NAME->vpid) { - continue; + /* see if it belongs to us */ + if (pptr->parent == ORTE_PROC_MY_NAME->vpid) { + /* is this child on our current list of children */ + if (!ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_LOCAL)) { + /* not on the local list */ + OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output, + "%s[%s:%d] adding proc %s to my local list", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + __FILE__, __LINE__, + ORTE_NAME_PRINT(&pptr->name))); + /* keep tabs of the number of local procs */ + jdata->num_local_procs++; + /* add this proc to our child list */ + OBJ_RETAIN(pptr); + ORTE_FLAG_SET(pptr, ORTE_PROC_FLAG_LOCAL); + opal_pointer_array_add(orte_local_children, pptr); } - for (m=0; m < node->procs->size; m++) { - if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, m))) { 
- continue; - } - if (!ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_LOCAL)) { - /* not on the local list */ - OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output, - "%s[%s:%d] adding proc %s to my local list", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - __FILE__, __LINE__, - ORTE_NAME_PRINT(&pptr->name))); - /* keep tabs of the number of local procs */ - jdata->num_local_procs++; - /* add this proc to our child list */ - OBJ_RETAIN(pptr); - ORTE_FLAG_SET(pptr, ORTE_PROC_FLAG_LOCAL); - opal_pointer_array_add(orte_local_children, pptr); - /* mark that this app_context is being used on this node */ - app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pptr->app_idx); - ORTE_FLAG_SET(app, ORTE_APP_FLAG_USED_ON_NODE); - } + + /* if the job is in restart mode, the child must not barrier when launched */ + if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) { + orte_set_attribute(&pptr->attributes, ORTE_PROC_NOBARRIER, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL); } + /* mark that this app_context is being used on this node */ + app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pptr->app_idx); + ORTE_FLAG_SET(app, ORTE_APP_FLAG_USED_ON_NODE); } + } + + if (!ORTE_PROC_IS_HNP && + !orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) { /* compute and save bindings of local children */ if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_bindings(jdata))) { ORTE_ERROR_LOG(rc); @@ -488,13 +494,6 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer, } } - /* reset any node map flags we used so the next job will start clean */ - for (n=0; n < jdata->map->nodes->size; n++) { - if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, n))) { - ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED); - } - } - /* if we wanted to see the map, now is the time to display it */ if (jdata->map->display_map) { orte_rmaps_base_display_map(jdata); diff --git a/orte/mca/plm/base/plm_base_launch_support.c 
b/orte/mca/plm/base/plm_base_launch_support.c index 677535aacf6..0c54807a7e6 100644 --- a/orte/mca/plm/base/plm_base_launch_support.c +++ b/orte/mca/plm/base/plm_base_launch_support.c @@ -209,7 +209,7 @@ static void files_ready(int status, void *cbdata) if (ORTE_SUCCESS != status) { ORTE_FORCED_TERMINATE(status); } else { - ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_SYSTEM_PREP); + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP); } } @@ -1497,7 +1497,7 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv, /* convert the nodes with daemons to a regex */ param = NULL; - if (ORTE_SUCCESS != (rc = orte_util_nidmap_create(&param))) { + if (ORTE_SUCCESS != (rc = orte_util_nidmap_create(orte_node_pool, &param))) { ORTE_ERROR_LOG(rc); return rc; } diff --git a/orte/mca/rmaps/base/Makefile.am b/orte/mca/rmaps/base/Makefile.am index 41b0420847c..d2930632ea4 100644 --- a/orte/mca/rmaps/base/Makefile.am +++ b/orte/mca/rmaps/base/Makefile.am @@ -12,7 +12,7 @@ # Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2011 Los Alamos National Security, LLC. # All rights reserved. -# Copyright (c) 2015 Intel, Inc. All rights reserved. +# Copyright (c) 2015-2017 Intel, Inc. All rights reserved. 
# $COPYRIGHT$ # # Additional copyrights may follow @@ -31,7 +31,8 @@ libmca_rmaps_la_SOURCES += \ base/rmaps_base_support_fns.c \ base/rmaps_base_ranking.c \ base/rmaps_base_print_fns.c \ - base/rmaps_base_binding.c + base/rmaps_base_binding.c \ + base/rmaps_base_assign_locations.c dist_ortedata_DATA = base/help-orte-rmaps-base.txt diff --git a/orte/mca/rmaps/base/base.h b/orte/mca/rmaps/base/base.h index b1f540241a7..beb4cee0445 100644 --- a/orte/mca/rmaps/base/base.h +++ b/orte/mca/rmaps/base/base.h @@ -99,7 +99,8 @@ OBJ_CLASS_DECLARATION(orte_rmaps_base_selected_module_t); /* * Map a job */ -ORTE_DECLSPEC int orte_rmaps_base_map_job(orte_job_t *jdata); +ORTE_DECLSPEC void orte_rmaps_base_map_job(int sd, short args, void *cbdata); +ORTE_DECLSPEC int orte_rmaps_base_assign_locations(orte_job_t *jdata); /** * Utility routines to get/set vpid mapping for the job diff --git a/orte/mca/rmaps/base/help-orte-rmaps-base.txt b/orte/mca/rmaps/base/help-orte-rmaps-base.txt index c04acf413d9..2f5f5b5d0c7 100644 --- a/orte/mca/rmaps/base/help-orte-rmaps-base.txt +++ b/orte/mca/rmaps/base/help-orte-rmaps-base.txt @@ -13,7 +13,7 @@ # Copyright (c) 2011-2015 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2011 Los Alamos National Security, LLC. # All rights reserved. -# Copyright (c) 2014 Intel, Inc. All rights reserved. +# Copyright (c) 2014-2017 Intel, Inc. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -410,3 +410,13 @@ Either the -host or -hostfile options were given, but the number of processes to start was omitted. This combination is not supported. Please specify the number of processes to run and try again. +# +[failed-assignments] +The attempt to assign hardware locations to processes on a +compute node failed: + + Node: %s + Policy: %s + +We cannot continue - please check that the policy is in +accordance with the actual available hardware. 
diff --git a/orte/mca/rmaps/base/rmaps_base_assign_locations.c b/orte/mca/rmaps/base/rmaps_base_assign_locations.c new file mode 100644 index 00000000000..b1536ded0aa --- /dev/null +++ b/orte/mca/rmaps/base/rmaps_base_assign_locations.c @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2011-2012 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. + * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. + * Copyright (c) 2016 Research Organization for Information Science + * and Technology (RIST). All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" +#include "orte/constants.h" + +#include <string.h> + +#include "orte/mca/mca.h" +#include "opal/util/output.h" +#include "opal/mca/base/base.h" + +#include "orte/runtime/orte_globals.h" +#include "orte/util/show_help.h" +#include "orte/mca/errmgr/errmgr.h" + +#include "orte/mca/rmaps/base/base.h" +#include "orte/mca/rmaps/base/rmaps_private.h" + + +int orte_rmaps_base_assign_locations(orte_job_t *jdata) +{ + int rc; + orte_rmaps_base_selected_module_t *mod; + + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps: assigning locations for job %s", + ORTE_JOBID_PRINT(jdata->jobid)); + + /* cycle thru the available mappers until one agrees to assign + * locations for the job + */ + if (1 == opal_list_get_size(&orte_rmaps_base.selected_modules)) { + /* forced selection */ + mod = (orte_rmaps_base_selected_module_t*)opal_list_get_first(&orte_rmaps_base.selected_modules); + jdata->map->req_mapper = strdup(mod->component->mca_component_name); + } + OPAL_LIST_FOREACH(mod, &orte_rmaps_base.selected_modules, orte_rmaps_base_selected_module_t) { + if (NULL == mod->module->assign_locations) { + continue; + } + if (ORTE_SUCCESS == (rc = mod->module->assign_locations(jdata))) { + return rc; + } + /* mappers return "next option" if they didn't attempt to + * process the job. anything else is a true error. 
+ */ + if (ORTE_ERR_TAKE_NEXT_OPTION != rc) { + ORTE_ERROR_LOG(rc); + return rc; + } + } + + /* if we get here without doing the assignments, then that's an error */ + orte_show_help("help-orte-rmaps-base.txt", "failed-assignments", true, + orte_process_info.nodename, + orte_rmaps_base_print_mapping(jdata->map->mapping)); + return ORTE_ERROR; +} diff --git a/orte/mca/rmaps/base/rmaps_base_map_job.c b/orte/mca/rmaps/base/rmaps_base_map_job.c index 8254bcfaf16..d5e2ac304dc 100644 --- a/orte/mca/rmaps/base/rmaps_base_map_job.c +++ b/orte/mca/rmaps/base/rmaps_base_map_job.c @@ -42,8 +42,10 @@ #include "orte/mca/rmaps/base/rmaps_private.h" -int orte_rmaps_base_map_job(orte_job_t *jdata) +void orte_rmaps_base_map_job(int fd, short args, void *cbdata) { + orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; + orte_job_t *jdata = caddy->jdata; orte_node_t *node; int rc, i, ppx = 0; bool did_map, given, pernode = false; @@ -116,7 +118,9 @@ int orte_rmaps_base_map_job(orte_job_t *jdata) /* inform the user of the error */ orte_show_help("help-orte-rmaps-base.txt", "num-procs-not-specified", true); OPAL_LIST_DESTRUCT(&nodes); - return ORTE_ERR_BAD_PARAM; + OBJ_RELEASE(caddy); + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED); + return; } } nprocs += slots; @@ -335,7 +339,9 @@ int orte_rmaps_base_map_job(orte_job_t *jdata) int i; if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return ORTE_ERR_NOT_FOUND; + OBJ_RELEASE(caddy); + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED); + return; } t0 = node->topology; for (i=1; i < orte_node_pool->size; i++) { @@ -368,15 +374,26 @@ int orte_rmaps_base_map_job(orte_job_t *jdata) */ if (ORTE_ERR_TAKE_NEXT_OPTION != rc) { ORTE_ERROR_LOG(rc); - return rc; + OBJ_RELEASE(caddy); + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED); + return; } } + /* reset any node map flags we used so the next job will start clean */ + for (i=0; i < 
jdata->map->nodes->size; i++) { + if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, i))) { + ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED); + } + } + if (did_map && ORTE_ERR_RESOURCE_BUSY == rc) { /* the map was done but nothing could be mapped * for launch as all the resources were busy */ orte_show_help("help-orte-rmaps-base.txt", "cannot-launch", true); - return rc; + OBJ_RELEASE(caddy); + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED); + return; } /* if we get here without doing the map, or with zero procs in @@ -386,7 +403,9 @@ int orte_rmaps_base_map_job(orte_job_t *jdata) orte_show_help("help-orte-rmaps-base.txt", "failed-map", true, did_map ? "mapped" : "unmapped", jdata->num_procs, jdata->map->num_nodes); - return ORTE_ERR_INVALID_NUM_PROCS; + OBJ_RELEASE(caddy); + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED); + return; } /* if any node is oversubscribed, then check to see if a binding @@ -399,17 +418,29 @@ int orte_rmaps_base_map_job(orte_job_t *jdata) } } - /* compute and save local ranks */ - if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) { - ORTE_ERROR_LOG(rc); - return rc; - } + if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) { + /* compute and save location assignments */ + if (ORTE_SUCCESS != (rc = orte_rmaps_base_assign_locations(jdata))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(caddy); + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED); + return; + } + } else { + /* compute and save local ranks */ + if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(caddy); + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED); + return; + } - if (orte_no_vm) { /* compute and save bindings */ if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_bindings(jdata))) { ORTE_ERROR_LOG(rc); - return rc; + OBJ_RELEASE(caddy); + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED); + 
return; } } @@ -427,7 +458,11 @@ int orte_rmaps_base_map_job(orte_job_t *jdata) } } - return ORTE_SUCCESS; + /* set the job state to the next position */ + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_COMPLETE); + + /* cleanup */ + OBJ_RELEASE(caddy); } void orte_rmaps_base_display_map(orte_job_t *jdata) diff --git a/orte/mca/rmaps/base/rmaps_base_ranking.c b/orte/mca/rmaps/base/rmaps_base_ranking.c index b297290a4d6..cb5d6a09a0c 100644 --- a/orte/mca/rmaps/base/rmaps_base_ranking.c +++ b/orte/mca/rmaps/base/rmaps_base_ranking.c @@ -49,19 +49,17 @@ #include "orte/mca/rmaps/base/base.h" static int rank_span(orte_job_t *jdata, - orte_app_context_t *app, - opal_list_t *nodes, hwloc_obj_type_t target, unsigned cache_level) { + orte_app_context_t *app; hwloc_obj_t obj; - int num_objs, i, j, rc; + int num_objs, i, j, m, n, rc; orte_vpid_t num_ranked=0; orte_node_t *node; - orte_proc_t *proc; + orte_proc_t *proc, *pptr; orte_vpid_t vpid; int cnt; - opal_list_item_t *item; hwloc_obj_t locale; opal_output_verbose(5, orte_rmaps_base_framework.framework_output, @@ -85,18 +83,144 @@ static int rank_span(orte_job_t *jdata, * are mapped */ - vpid = jdata->num_procs; - cnt = 0; - while (cnt < app->num_procs) { - for (item = opal_list_get_first(nodes); - item != opal_list_get_end(nodes); - item = opal_list_get_next(item)) { - node = (orte_node_t*)item; + vpid = 0; + for (n=0; n < jdata->apps->size; n++) { + if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, n))) { + continue; + } + + cnt = 0; + while (cnt < app->num_procs) { + for (m=0; m < jdata->map->nodes->size; m++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, m))) { + continue; + } + /* get the number of objects - only consider those we can actually use */ + num_objs = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, target, + cache_level, OPAL_HWLOC_AVAILABLE); + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + 
"mca:rmaps:rank_span: found %d objects on node %s with %d procs", + num_objs, node->name, (int)node->num_procs); + if (0 == num_objs) { + return ORTE_ERR_NOT_SUPPORTED; + } + + /* for each object */ + for (i=0; i < num_objs && cnt < app->num_procs; i++) { + obj = opal_hwloc_base_get_obj_by_type(node->topology->topo, target, + cache_level, i, OPAL_HWLOC_AVAILABLE); + + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rank_span: working object %d", i); + + /* cycle thru the procs on this node */ + for (j=0; j < node->procs->size && cnt < app->num_procs; j++) { + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { + continue; + } + /* ignore procs from other jobs */ + if (proc->name.jobid != jdata->jobid) { + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rank_span skipping proc %s - from another job, num_ranked %d", + ORTE_NAME_PRINT(&proc->name), num_ranked); + continue; + } + /* ignore procs that are already assigned */ + if (ORTE_VPID_INVALID != proc->name.vpid) { + continue; + } + /* ignore procs from other apps */ + if (proc->app_idx != app->idx) { + continue; + } + /* protect against bozo case */ + locale = NULL; + if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) { + ORTE_ERROR_LOG(ORTE_ERROR); + return ORTE_ERROR; + } + /* ignore procs not on this object */ + if (!hwloc_bitmap_intersects(obj->cpuset, locale->cpuset)) { + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rank_span: proc at position %d is not on object %d", + j, i); + continue; + } + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rank_span: assigning vpid %s", ORTE_VPID_PRINT(vpid)); + proc->name.vpid = vpid++; + if (0 == cnt) { + app->first_rank = proc->name.vpid; + } + cnt++; + + /* insert the proc into the jdata array */ + if (NULL != (pptr = 
(orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->name.vpid))) { + OBJ_RELEASE(pptr); + } + OBJ_RETAIN(proc); + if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /* track where the highest vpid landed - this is our + * new bookmark + */ + jdata->bookmark = node; + /* move to next object */ + break; + } + } + } + } + } + + return ORTE_SUCCESS; +} + +static int rank_fill(orte_job_t *jdata, + hwloc_obj_type_t target, + unsigned cache_level) +{ + orte_app_context_t *app; + hwloc_obj_t obj; + int num_objs, i, j, m, n, rc; + orte_vpid_t num_ranked=0; + orte_node_t *node; + orte_proc_t *proc, *pptr; + orte_vpid_t vpid; + int cnt; + hwloc_obj_t locale; + + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rank_fill: for job %s", + ORTE_JOBID_PRINT(jdata->jobid)); + + /* if the ranking is fill, then we rank all the procs + * within a given object before moving on to the next + * + * Node 0 Node 1 + * Obj 0 Obj 1 Obj 0 Obj 1 + * 0 1 4 5 8 9 12 13 + * 2 3 6 7 10 11 14 15 + */ + + vpid = 0; + for (n=0; n < jdata->apps->size; n++) { + if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, n))) { + continue; + } + + cnt = 0; + for (m=0; m < jdata->map->nodes->size; m++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, m))) { + continue; + } /* get the number of objects - only consider those we can actually use */ num_objs = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, target, cache_level, OPAL_HWLOC_AVAILABLE); opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_span: found %d objects on node %s with %d procs", + "mca:rmaps:rank_fill: found %d objects on node %s with %d procs", num_objs, node->name, (int)node->num_procs); if (0 == num_objs) { return ORTE_ERR_NOT_SUPPORTED; @@ -108,7 +232,7 @@ static int rank_span(orte_job_t *jdata, cache_level, i, 
OPAL_HWLOC_AVAILABLE); opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_span: working object %d", i); + "mca:rmaps:rank_fill: working object %d", i); /* cycle thru the procs on this node */ for (j=0; j < node->procs->size && cnt < app->num_procs; j++) { @@ -118,7 +242,7 @@ static int rank_span(orte_job_t *jdata, /* ignore procs from other jobs */ if (proc->name.jobid != jdata->jobid) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_span skipping proc %s - from another job, num_ranked %d", + "mca:rmaps:rank_fill skipping proc %s - from another job, num_ranked %d", ORTE_NAME_PRINT(&proc->name), num_ranked); continue; } @@ -130,7 +254,7 @@ static int rank_span(orte_job_t *jdata, if (proc->app_idx != app->idx) { continue; } - /* protect against bozo case */ + /* protect against bozo case */ locale = NULL; if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) { ORTE_ERROR_LOG(ORTE_ERROR); @@ -139,19 +263,23 @@ static int rank_span(orte_job_t *jdata, /* ignore procs not on this object */ if (!hwloc_bitmap_intersects(obj->cpuset, locale->cpuset)) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_span: proc at position %d is not on object %d", + "mca:rmaps:rank_fill: proc at position %d is not on object %d", j, i); continue; } opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_span: assigning vpid %s", ORTE_VPID_PRINT(vpid)); + "mca:rmaps:rank_fill: assigning vpid %s", ORTE_VPID_PRINT(vpid)); proc->name.vpid = vpid++; if (0 == cnt) { app->first_rank = proc->name.vpid; } cnt++; - /* insert the proc into the jdata array - no harm if already there */ + /* insert the proc into the jdata array */ + if (NULL != (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->name.vpid))) { + OBJ_RELEASE(pptr); + } + OBJ_RETAIN(proc); if (ORTE_SUCCESS != (rc = 
opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) { ORTE_ERROR_LOG(rc); return rc; @@ -160,8 +288,6 @@ static int rank_span(orte_job_t *jdata, * new bookmark */ jdata->bookmark = node; - /* move to next object */ - break; } } } @@ -170,138 +296,26 @@ static int rank_span(orte_job_t *jdata, return ORTE_SUCCESS; } -static int rank_fill(orte_job_t *jdata, - orte_app_context_t *app, - opal_list_t *nodes, - hwloc_obj_type_t target, - unsigned cache_level) -{ - hwloc_obj_t obj; - int num_objs, i, j, rc; - orte_vpid_t num_ranked=0; - orte_node_t *node; - orte_proc_t *proc; - orte_vpid_t vpid; - int cnt; - opal_list_item_t *item; - hwloc_obj_t locale; - - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_fill: for job %s", - ORTE_JOBID_PRINT(jdata->jobid)); - - /* if the ranking is fill, then we rank all the procs - * within a given object before moving on to the next - * - * Node 0 Node 1 - * Obj 0 Obj 1 Obj 0 Obj 1 - * 0 1 4 5 8 9 12 13 - * 2 3 6 7 10 11 14 15 - */ - - vpid = jdata->num_procs; - cnt = 0; - for (item = opal_list_get_first(nodes); - item != opal_list_get_end(nodes); - item = opal_list_get_next(item)) { - node = (orte_node_t*)item; - /* get the number of objects - only consider those we can actually use */ - num_objs = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, target, - cache_level, OPAL_HWLOC_AVAILABLE); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_fill: found %d objects on node %s with %d procs", - num_objs, node->name, (int)node->num_procs); - if (0 == num_objs) { - return ORTE_ERR_NOT_SUPPORTED; - } - - /* for each object */ - for (i=0; i < num_objs && cnt < app->num_procs; i++) { - obj = opal_hwloc_base_get_obj_by_type(node->topology->topo, target, - cache_level, i, OPAL_HWLOC_AVAILABLE); - - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_fill: working object %d", i); - - /* cycle thru the procs on this node 
*/ - for (j=0; j < node->procs->size && cnt < app->num_procs; j++) { - if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { - continue; - } - /* ignore procs from other jobs */ - if (proc->name.jobid != jdata->jobid) { - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_fill skipping proc %s - from another job, num_ranked %d", - ORTE_NAME_PRINT(&proc->name), num_ranked); - continue; - } - /* ignore procs that are already assigned */ - if (ORTE_VPID_INVALID != proc->name.vpid) { - continue; - } - /* ignore procs from other apps */ - if (proc->app_idx != app->idx) { - continue; - } - /* protect against bozo case */ - locale = NULL; - if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) { - ORTE_ERROR_LOG(ORTE_ERROR); - return ORTE_ERROR; - } - /* ignore procs not on this object */ - if (!hwloc_bitmap_intersects(obj->cpuset, locale->cpuset)) { - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_fill: proc at position %d is not on object %d", - j, i); - continue; - } - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_fill: assigning vpid %s", ORTE_VPID_PRINT(vpid)); - proc->name.vpid = vpid++; - if (0 == cnt) { - app->first_rank = proc->name.vpid; - } - cnt++; - - /* insert the proc into the jdata array - no harm if already there */ - if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* track where the highest vpid landed - this is our - * new bookmark - */ - jdata->bookmark = node; - } - } - } - - return ORTE_SUCCESS; -} - static int rank_by(orte_job_t *jdata, - orte_app_context_t *app, - opal_list_t *nodes, hwloc_obj_type_t target, unsigned cache_level) { + orte_app_context_t *app; hwloc_obj_t obj; - int num_objs, i, j, rc; + int num_objs, i, j, m, n, rc; orte_vpid_t num_ranked=0; orte_node_t *node; - 
orte_proc_t *proc; + orte_proc_t *proc, *pptr; orte_vpid_t vpid; int cnt; opal_pointer_array_t objs; bool all_done; - opal_list_item_t *item; hwloc_obj_t locale; if (ORTE_RANKING_SPAN & ORTE_GET_RANKING_DIRECTIVE(jdata->map->ranking)) { - return rank_span(jdata, app, nodes, target, cache_level); + return rank_span(jdata, target, cache_level); } else if (ORTE_RANKING_FILL & ORTE_GET_RANKING_DIRECTIVE(jdata->map->ranking)) { - return rank_fill(jdata, app, nodes, target, cache_level); + return rank_fill(jdata, target, cache_level); } /* if ranking is not spanned or filled, then we @@ -316,122 +330,140 @@ static int rank_by(orte_job_t *jdata, * 4 6 5 7 12 14 13 15 */ - /* setup the pointer array */ - OBJ_CONSTRUCT(&objs, opal_pointer_array_t); - opal_pointer_array_init(&objs, 2, INT_MAX, 2); - - vpid = jdata->num_procs; - cnt = 0; - for (item = opal_list_get_first(nodes); - item != opal_list_get_end(nodes); - item = opal_list_get_next(item)) { - node = (orte_node_t*)item; - /* get the number of objects - only consider those we can actually use */ - num_objs = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, target, - cache_level, OPAL_HWLOC_AVAILABLE); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_by: found %d objects on node %s with %d procs", - num_objs, node->name, (int)node->num_procs); - if (0 == num_objs) { - return ORTE_ERR_NOT_SUPPORTED; - } - /* collect all the objects */ - for (i=0; i < num_objs; i++) { - obj = opal_hwloc_base_get_obj_by_type(node->topology->topo, target, - cache_level, i, OPAL_HWLOC_AVAILABLE); - opal_pointer_array_set_item(&objs, i, obj); + vpid = 0; + for (n=0; n < jdata->apps->size; n++) { + if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, n))) { + continue; } - /* cycle across the objects, assigning a proc to each one, - * until all procs have been assigned - unfortunately, since - * more than this job may be mapped onto a node, the number - * of procs on 
the node can't be used to tell us when we - * are done. Instead, we have to just keep going until all - * procs are ranked - which means we have to make one extra - * pass thru the loop - * - * Perhaps someday someone will come up with a more efficient - * algorithm, but this works for now. - */ - all_done = false; - while (!all_done && cnt < app->num_procs) { - all_done = true; - /* cycle across the objects */ - for (i=0; i < num_objs && cnt < app->num_procs; i++) { - obj = (hwloc_obj_t)opal_pointer_array_get_item(&objs, i); + /* setup the pointer array */ + OBJ_CONSTRUCT(&objs, opal_pointer_array_t); + opal_pointer_array_init(&objs, 2, INT_MAX, 2); - /* find the next proc on this object */ - for (j=0; j < node->procs->size && cnt < app->num_procs; j++) { - if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { - continue; - } - /* ignore procs from other jobs */ - if (proc->name.jobid != jdata->jobid) { - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_by skipping proc %s - from another job, num_ranked %d", - ORTE_NAME_PRINT(&proc->name), num_ranked); - continue; - } - /* ignore procs that are already ranked */ - if (ORTE_VPID_INVALID != proc->name.vpid) { - continue; - } - /* ignore procs from other apps */ - if (proc->app_idx != app->idx) { - continue; - } - if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) { - continue; - } - /* ignore procs on other objects */ - if (!hwloc_bitmap_intersects(obj->cpuset, locale->cpuset)) { + cnt = 0; + for (m=0; m < jdata->map->nodes->size; m++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, m))) { + continue; + } + + /* get the number of objects - only consider those we can actually use */ + num_objs = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, target, + cache_level, OPAL_HWLOC_AVAILABLE); + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + 
"mca:rmaps:rank_by: found %d objects on node %s with %d procs", + num_objs, node->name, (int)node->num_procs); + if (0 == num_objs) { + OBJ_DESTRUCT(&objs); + return ORTE_ERR_NOT_SUPPORTED; + } + /* collect all the objects */ + for (i=0; i < num_objs; i++) { + obj = opal_hwloc_base_get_obj_by_type(node->topology->topo, target, + cache_level, i, OPAL_HWLOC_AVAILABLE); + opal_pointer_array_set_item(&objs, i, obj); + } + + /* cycle across the objects, assigning a proc to each one, + * until all procs have been assigned - unfortunately, since + * more than this job may be mapped onto a node, the number + * of procs on the node can't be used to tell us when we + * are done. Instead, we have to just keep going until all + * procs are ranked - which means we have to make one extra + * pass thru the loop + * + * Perhaps someday someone will come up with a more efficient + * algorithm, but this works for now. + */ + all_done = false; + while (!all_done && cnt < app->num_procs) { + all_done = true; + /* cycle across the objects */ + for (i=0; i < num_objs && cnt < app->num_procs && all_done; i++) { + obj = (hwloc_obj_t)opal_pointer_array_get_item(&objs, i); + /* find the next proc for this job and app_context */ + for (j=0; j < node->procs->size && cnt < app->num_procs; j++) { + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { + continue; + } + /* ignore procs from other jobs */ + if (proc->name.jobid != jdata->jobid) { + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rank_by skipping proc %s - from another job, num_ranked %d", + ORTE_NAME_PRINT(&proc->name), num_ranked); + continue; + } + /* ignore procs that are already ranked */ + if (ORTE_VPID_INVALID != proc->name.vpid) { + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rank_by skipping proc %s - already ranked, num_ranked %d", + ORTE_NAME_PRINT(&proc->name), num_ranked); + continue; + } + /* ignore procs from other apps 
*/ + if (proc->app_idx != app->idx) { + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rank_by skipping proc %s - from another app, num_ranked %d", + ORTE_NAME_PRINT(&proc->name), num_ranked); + continue; + } + /* protect against bozo case */ + locale = NULL; + if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) { + ORTE_ERROR_LOG(ORTE_ERROR); + return ORTE_ERROR; + } + /* ignore procs not on this object */ + if (!hwloc_bitmap_intersects(obj->cpuset, locale->cpuset)) { + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rank_by: proc at position %d is not on object %d", + j, i); + continue; + } + /* assign the vpid */ + proc->name.vpid = vpid++; + if (0 == cnt) { + app->first_rank = proc->name.vpid; + } + cnt++; opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_by: proc at position %d is not on object %d", - j, i); - continue; + "mca:rmaps:rank_by: assigned rank %s", ORTE_VPID_PRINT(proc->name.vpid)); + /* insert the proc into the jdata array */ + if (NULL != (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->name.vpid))) { + OBJ_RELEASE(pptr); + } + OBJ_RETAIN(proc); + if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) { + ORTE_ERROR_LOG(rc); + OBJ_DESTRUCT(&objs); + return rc; + } + /* flag that one was mapped */ + all_done = false; + /* track where the highest vpid landed - this is our + * new bookmark + */ + jdata->bookmark = node; + /* move to next object */ + break; } - proc->name.vpid = vpid++; - if (0 == cnt) { - app->first_rank = proc->name.vpid; - } - cnt++; - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_by: assigned rank %s", ORTE_VPID_PRINT(proc->name.vpid)); - /* insert the proc into the jdata array - no harm if already there */ - if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, 
proc))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* flag that one was mapped */ - all_done = false; - /* track where the highest vpid landed - this is our - * new bookmark - */ - jdata->bookmark = node; - /* move to next object */ - break; } } } + /* cleanup */ + OBJ_DESTRUCT(&objs); } - - /* cleanup */ - OBJ_DESTRUCT(&objs); - return ORTE_SUCCESS; } -int orte_rmaps_base_compute_vpids(orte_job_t *jdata, - orte_app_context_t *app, - opal_list_t *nodes) +int orte_rmaps_base_compute_vpids(orte_job_t *jdata) { orte_job_map_t *map; + orte_app_context_t *app; orte_vpid_t vpid; - int j, cnt; + int j, m, n, cnt; orte_node_t *node; - orte_proc_t *proc; + orte_proc_t *proc, *pptr; int rc; - opal_list_item_t *item; bool one_found; map = jdata->map; @@ -445,7 +477,7 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata, opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: computing ranks by NUMA for job %s", ORTE_JOBID_PRINT(jdata->jobid)); - if (ORTE_SUCCESS != (rc = rank_by(jdata, app, nodes, HWLOC_OBJ_NODE, 0))) { + if (ORTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_NODE, 0))) { if (ORTE_ERR_NOT_SUPPORTED == rc && !(ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(map->ranking))) { ORTE_SET_RANKING_POLICY(map->ranking, ORTE_RANK_BY_SLOT); @@ -460,7 +492,7 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata, opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: computing ranks by socket for job %s", ORTE_JOBID_PRINT(jdata->jobid)); - if (ORTE_SUCCESS != (rc = rank_by(jdata, app, nodes, HWLOC_OBJ_SOCKET, 0))) { + if (ORTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_SOCKET, 0))) { if (ORTE_ERR_NOT_SUPPORTED == rc && !(ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(map->ranking))) { ORTE_SET_RANKING_POLICY(map->ranking, ORTE_RANK_BY_SLOT); @@ -475,7 +507,7 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata, opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: computing ranks by L3cache for job 
%s", ORTE_JOBID_PRINT(jdata->jobid)); - if (ORTE_SUCCESS != (rc = rank_by(jdata, app, nodes, HWLOC_OBJ_CACHE, 3))) { + if (ORTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_CACHE, 3))) { if (ORTE_ERR_NOT_SUPPORTED == rc && !(ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(map->ranking))) { ORTE_SET_RANKING_POLICY(map->ranking, ORTE_RANK_BY_SLOT); @@ -490,7 +522,7 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata, opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: computing ranks by L2cache for job %s", ORTE_JOBID_PRINT(jdata->jobid)); - if (ORTE_SUCCESS != (rc = rank_by(jdata, app, nodes, HWLOC_OBJ_CACHE, 2))) { + if (ORTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_CACHE, 2))) { if (ORTE_ERR_NOT_SUPPORTED == rc && !(ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(map->ranking))) { ORTE_SET_RANKING_POLICY(map->ranking, ORTE_RANK_BY_SLOT); @@ -505,7 +537,7 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata, opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: computing ranks by L1cache for job %s", ORTE_JOBID_PRINT(jdata->jobid)); - if (ORTE_SUCCESS != (rc = rank_by(jdata, app, nodes, HWLOC_OBJ_CACHE, 1))) { + if (ORTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_CACHE, 1))) { if (ORTE_ERR_NOT_SUPPORTED == rc && !(ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(map->ranking))) { ORTE_SET_RANKING_POLICY(map->ranking, ORTE_RANK_BY_SLOT); @@ -520,7 +552,7 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata, opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: computing ranks by core for job %s", ORTE_JOBID_PRINT(jdata->jobid)); - if (ORTE_SUCCESS != (rc = rank_by(jdata, app, nodes, HWLOC_OBJ_CORE, 0))) { + if (ORTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_CORE, 0))) { if (ORTE_ERR_NOT_SUPPORTED == rc && !(ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(map->ranking))) { ORTE_SET_RANKING_POLICY(map->ranking, ORTE_RANK_BY_SLOT); @@ -528,6 +560,7 @@ int 
orte_rmaps_base_compute_vpids(orte_job_t *jdata, } ORTE_ERROR_LOG(rc); } + opal_output(0, "DONE"); return rc; } @@ -535,7 +568,7 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata, opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: computing ranks by hwthread for job %s", ORTE_JOBID_PRINT(jdata->jobid)); - if (ORTE_SUCCESS != (rc = rank_by(jdata, app, nodes, HWLOC_OBJ_PU, 0))) { + if (ORTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_PU, 0))) { if (ORTE_ERR_NOT_SUPPORTED == rc && !(ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(map->ranking))) { ORTE_SET_RANKING_POLICY(map->ranking, ORTE_RANK_BY_SLOT); @@ -549,26 +582,83 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata, if (ORTE_RANK_BY_NODE == ORTE_GET_RANKING_POLICY(map->ranking) || ORTE_RANK_BY_BOARD == ORTE_GET_RANKING_POLICY(map->ranking)) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:base: computing vpids by node for job %s app %d on %d nodes", - ORTE_JOBID_PRINT(jdata->jobid), (int)app->idx, - (int)opal_list_get_size(nodes)); - /* bozo check */ - if (0 == opal_list_get_size(nodes)) { - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - return ORTE_ERR_BAD_PARAM; - } + "mca:rmaps:base: computing vpids by node for job %s", + ORTE_JOBID_PRINT(jdata->jobid)); /* assign the ranks round-robin across nodes - only one board/node * at this time, so they are equivalent */ - cnt=0; - vpid=jdata->num_procs; - one_found = true; - while (cnt < app->num_procs && one_found) { - one_found = false; - for (item = opal_list_get_first(nodes); - item != opal_list_get_end(nodes); - item = opal_list_get_next(item)) { - node = (orte_node_t*)item; + vpid=0; + for (n=0; n < jdata->apps->size; n++) { + if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, n))) { + continue; + } + cnt=0; + one_found = true; + while (cnt < app->num_procs && one_found) { + one_found = false; + for (m=0; m < jdata->map->nodes->size; m++) { + if (NULL == (node = 
(orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, m))) { + continue; + } + for (j=0; j < node->procs->size; j++) { + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { + continue; + } + /* ignore procs from other jobs */ + if (proc->name.jobid != jdata->jobid) { + continue; + } + /* ignore procs from other apps */ + if (proc->app_idx != app->idx) { + continue; + } + if (ORTE_VPID_INVALID != proc->name.vpid) { + continue; + } + proc->name.vpid = vpid++; + /* insert the proc into the jdata array */ + if (NULL != (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->name.vpid))) { + OBJ_RELEASE(pptr); + } + OBJ_RETAIN(proc); + if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) { + ORTE_ERROR_LOG(rc); + return rc; + } + cnt++; + one_found = true; + /* track where the highest vpid landed - this is our + * new bookmark + */ + jdata->bookmark = node; + break; /* move on to next node */ + } + } + } + if (cnt < app->num_procs) { + ORTE_ERROR_LOG(ORTE_ERR_FATAL); + return ORTE_ERR_FATAL; + } + } + return ORTE_SUCCESS; + } + + rankbyslot: + if (ORTE_RANK_BY_SLOT == ORTE_GET_RANKING_POLICY(map->ranking)) { + /* assign the ranks sequentially */ + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:base: computing vpids by slot for job %s", + ORTE_JOBID_PRINT(jdata->jobid)); + vpid = 0; + for (n=0; n < jdata->apps->size; n++) { + if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, n))) { + continue; + } + for (m=0; m < jdata->map->nodes->size; m++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, m))) { + continue; + } + for (j=0; j < node->procs->size; j++) { if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { continue; @@ -581,70 +671,25 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata, if (proc->app_idx != app->idx) { continue; } - if 
(ORTE_VPID_INVALID != proc->name.vpid) { - continue; + if (ORTE_VPID_INVALID == proc->name.vpid) { + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:base: assigning rank %s to node %s", + ORTE_VPID_PRINT(vpid), node->name); + proc->name.vpid = vpid++; + /* track where the highest vpid landed - this is our + * new bookmark + */ + jdata->bookmark = node; } - proc->name.vpid = vpid++; - /* insert the proc into the jdata array - no harm if already there */ + /* insert the proc into the jdata array */ + if (NULL != (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->name.vpid))) { + OBJ_RELEASE(pptr); + } + OBJ_RETAIN(proc); if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) { ORTE_ERROR_LOG(rc); return rc; } - cnt++; - one_found = true; - /* track where the highest vpid landed - this is our - * new bookmark - */ - jdata->bookmark = node; - break; /* move on to next node */ - } - } - } - if (cnt < app->num_procs) { - ORTE_ERROR_LOG(ORTE_ERR_FATAL); - return ORTE_ERR_FATAL; - } - return ORTE_SUCCESS; - } - - rankbyslot: - if (ORTE_RANK_BY_SLOT == ORTE_GET_RANKING_POLICY(map->ranking)) { - /* assign the ranks sequentially */ - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:base: computing vpids by slot for job %s", - ORTE_JOBID_PRINT(jdata->jobid)); - vpid = jdata->num_procs; - for (item = opal_list_get_first(nodes); - item != opal_list_get_end(nodes); - item = opal_list_get_next(item)) { - node = (orte_node_t*)item; - - for (j=0; j < node->procs->size; j++) { - if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { - continue; - } - /* ignore procs from other jobs */ - if (proc->name.jobid != jdata->jobid) { - continue; - } - /* ignore procs from other apps */ - if (proc->app_idx != app->idx) { - continue; - } - if (ORTE_VPID_INVALID == proc->name.vpid) { - opal_output_verbose(5, 
orte_rmaps_base_framework.framework_output, - "mca:rmaps:base: assigning rank %s to node %s", - ORTE_VPID_PRINT(vpid), node->name); - proc->name.vpid = vpid++; - /* track where the highest vpid landed - this is our - * new bookmark - */ - jdata->bookmark = node; - } - /* insert the proc into the jdata array - no harm if already there */ - if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) { - ORTE_ERROR_LOG(rc); - return rc; } } } diff --git a/orte/mca/rmaps/base/rmaps_base_support_fns.c b/orte/mca/rmaps/base/rmaps_base_support_fns.c index b9003c93f59..cf8b9b71f69 100644 --- a/orte/mca/rmaps/base/rmaps_base_support_fns.c +++ b/orte/mca/rmaps/base/rmaps_base_support_fns.c @@ -351,6 +351,7 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr /* the list is empty - if the HNP is allocated, then add it */ if (orte_hnp_is_allocated) { nd = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0); + OBJ_RETAIN(nd); opal_list_append(allocated_nodes, &nd->super); } else { nd = NULL; @@ -476,8 +477,7 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr /* if the hnp was not allocated, or flagged not to be used, * then remove it here */ if (!orte_hnp_is_allocated || (ORTE_GET_MAPPING_DIRECTIVE(policy) & ORTE_MAPPING_NO_USE_LOCAL)) { - node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0); - if (node == (orte_node_t*)item) { + if (0 == node->index) { opal_list_remove_item(allocated_nodes, item); OBJ_RELEASE(item); /* "un-retain" it */ item = next; @@ -508,24 +508,24 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr continue; } if (node->slots > node->slots_inuse) { - /* add the available slots */ - OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output, - "%s node %s has %d slots available", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - node->name, node->slots - node->slots_inuse)); - num_slots += node->slots - 
node->slots_inuse; - item = next; - continue; + /* add the available slots */ + OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output, + "%s node %s has %d slots available", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + node->name, node->slots - node->slots_inuse)); + num_slots += node->slots - node->slots_inuse; + item = next; + continue; } if (!(ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(policy))) { - /* nothing needed to do here - we don't add slots to the - * count as we don't have any available. Just let the mapper - * do what it needs to do to meet the request - */ - OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output, - "%s node %s is fully used, but available for oversubscription", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - node->name)); + /* nothing needed to do here - we don't add slots to the + * count as we don't have any available. Just let the mapper + * do what it needs to do to meet the request + */ + OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output, + "%s node %s is fully used, but available for oversubscription", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + node->name)); } else { /* if we cannot use it, remove it from list */ opal_list_remove_item(allocated_nodes, item); diff --git a/orte/mca/rmaps/base/rmaps_private.h b/orte/mca/rmaps/base/rmaps_private.h index 8950a1b76df..d9e7f9dcfe0 100644 --- a/orte/mca/rmaps/base/rmaps_private.h +++ b/orte/mca/rmaps/base/rmaps_private.h @@ -12,6 +12,7 @@ * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011-2012 Los Alamos National Security, LLC. * All rights reserved. + * Copyright (c) 2017 Intel, Inc. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -55,9 +56,7 @@ ORTE_DECLSPEC orte_proc_t* orte_rmaps_base_setup_proc(orte_job_t *jdata, ORTE_DECLSPEC orte_node_t* orte_rmaps_base_get_starting_point(opal_list_t *node_list, orte_job_t *jdata); -ORTE_DECLSPEC int orte_rmaps_base_compute_vpids(orte_job_t *jdata, - orte_app_context_t *app, - opal_list_t *nodes); +ORTE_DECLSPEC int orte_rmaps_base_compute_vpids(orte_job_t *jdata); ORTE_DECLSPEC int orte_rmaps_base_compute_local_ranks(orte_job_t *jdata); diff --git a/orte/mca/rmaps/lama/.opal_ignore b/orte/mca/rmaps/lama/.opal_ignore deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/orte/mca/rmaps/lama/Makefile.am b/orte/mca/rmaps/lama/Makefile.am deleted file mode 100644 index 0512f8b10da..00000000000 --- a/orte/mca/rmaps/lama/Makefile.am +++ /dev/null @@ -1,40 +0,0 @@ -# -# Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. -# -# Copyright (c) 2012 Cisco Systems, Inc. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -dist_ortedata_DATA = help-orte-rmaps-lama.txt - -sources = \ - rmaps_lama_module.c \ - rmaps_lama_max_tree.c \ - rmaps_lama_params.c \ - rmaps_lama.h \ - rmaps_lama_component.c - -# Make the output library in this directory, and name it either -# mca__.la (for DSO builds) or libmca__.la -# (for static builds). 
- -if MCA_BUILD_orte_rmaps_lama_DSO -component_noinst = -component_install = mca_rmaps_lama.la -else -component_noinst = libmca_rmaps_lama.la -component_install = -endif - -mcacomponentdir = $(ortelibdir) -mcacomponent_LTLIBRARIES = $(component_install) -mca_rmaps_lama_la_SOURCES = $(sources) -mca_rmaps_lama_la_LDFLAGS = -module -avoid-version - -noinst_LTLIBRARIES = $(component_noinst) -libmca_rmaps_lama_la_SOURCES =$(sources) -libmca_rmaps_lama_la_LDFLAGS = -module -avoid-version diff --git a/orte/mca/rmaps/lama/help-orte-rmaps-lama.txt b/orte/mca/rmaps/lama/help-orte-rmaps-lama.txt deleted file mode 100644 index f1b7239bb4f..00000000000 --- a/orte/mca/rmaps/lama/help-orte-rmaps-lama.txt +++ /dev/null @@ -1,173 +0,0 @@ -# -*- text -*- -# -# Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. -# Copyright (c) 2013 Cisco Systems, Inc. All rights reserved. -# -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# -# This is the US/English general help file for LAMA Mapper. -# -[orte-rmaps-lama:multi-apps-and-zero-np] -RMAPS found multiple applications to be launched, with at least one that failed -to specify the number of processes to execute. When specifying multiple -applications, you must specify how many processes of each to launch via the --np argument. -# -[orte-rmaps-lama:oversubscribe] -RMaps LAMA detected oversubscription after mapping %d of %d processes. -Since you have asked not to oversubscribe the resources the job will not -be launched. If you would instead like to oversubscribe the resources -try using the --oversubscribe option to mpirun. -# -[orte-rmaps-lama:no-resources-available] -RMaps LAMA detected that there are not enough resources to map the -remainder of the job. Check the command line options, and the number of -nodes allocated to this job. 
- Application Context : %d - # of Processes Successfully Mapped: %d - # of Processes Requested : %d - Mapping : %s - Binding : %s - MPPR : %s - Ordering : %s -# -[orte-rmaps-lama:merge-conflict-bad-prune-src] -RMaps LAMA detected that it needed to prune a level of the hierarchy that -was necessary for one of the command line parameters. Check your allocation -and the options below to make sure they are correct. - Conflicting Level Description: %s - Mapping : %s - Binding : %s - MPPR : %s - Ordering : %s -# -[invalid mapping option] -The specified mapping option is not supported with the LAMA rmaps -mapper: - - Specified mapping option: %s - Reason it is invalid: %s - -LAMA supports the following options to the mpirun --map-by option: - - node, numa, socket, l1cache, l2cache, l3cache, core, hwthread, slot - -Alternatively, LAMA supports specifying a sequence of letters in the -rmaps_lama_map MCA parameter; each letter indicates a "direction" for -mapping. The rmaps_lama_map MCA parameter is richer/more flexible -than the --may-by CLI option. If rmaps_lama_map is specified, the -following letters must be specified: - - h: hardware thread - c: processor core - s: processor socket - n: node (server) - -The following may also optionally be included in the mapping string: - - N: NUMA node - L1: L1 cache - L2: L2 cache - L3: L3 cache - -For example, the two commands below are equivalent: - - mpirun --mca rmaps lama --mca rmaps_lama_map csNh ... - mpirun --mca rmaps lama --map-by core ... 
-# -[invalid binding option] -The specified binding option is not supported with the LAMA rmaps -mapper: - - Specified binding option: %s - Reason it is invalid: %s - -LAMA binding options can be specified via the mpirun --bind-to command -line option or rmaps_lama_bind MCA param: - - --bind-to rmaps_lama_binding - Locality option option - ---------------- --------- ------------------ - Hardware thread hwthread h - Processor core core c - Processor socket socket s - NUMA node numa N - L1 cache l1cache L1 - L2 cache l2cache L2 - L3 cache l3cache L3 - Node (server) node n - -The --bind-to option assumes a single locality (e.g., bind each MPI -process to a single core, socket, etc.). The rmaps_lama_bind MCA -param requires an integer specifying how many localities to which to -bind. For example, the following two command lines are equivalent, -and bind each MPI process to a single core: - - mpirun --btl rmaps lama --mca rmaps_lama_bind 1c ... - mpirun --btl rmaps lama --bind-to core ... - -The rmaps_lama_bind MCA parameter is more flexible than the --bind-to -CLI option, because it allows binding to multiple resources. For -example, specifing an rmaps_lama_bind value of "2c" binds each MPI -process to two cores. -# -[invalid ordering option] -The specified ordering option is not supported. - - Specified ordering option: %s - -The LAMA ordering can be specified via the rmaps_lama_ordering MCA -parameter. - -Two options are supported for ordering ranks in MPI_COMM_WORLD (MCW): - - s: Sequential. MCW rank ordering is sequential by hardware thread - across all nodes. E.g., MCW rank 0 is the first process on node - 0; MCW rank 1 is the second process on node 0, and so on. - n: Natural. MCW rank ordering follows the "natural" mapping layout. - For example, in a by-socket layout, MCW rank 0 is the first - process on the 1st socket on node 0. MCW rank 1 is then the - first process on the 2nd socket on node 0. And so on. 
-# -[invalid mppr option] -The specified Max Processes Per Resource (MPPR) value is invalid (in -the rmaps_lama_mppr MCA paramter): - - Specified MPPR: %s - Reason is is invalid: %s - -The MPPR is a comma-delimited list of specifications indicating how -many processes are allowed on a given type of resource before an MPI -job is considered to have oversubscribed that resource. Each -specification is a token in the format of "NUMBER:RESOURCE". For -example, the default MPPR of "1:c" means that Open MPI will map one -process per processor core before considering cores to be -oversubscribed. - -Multiple specifications may be useful; for example "1:c,2:s" maintains -the default one-process-per-core limitation, but places an additional -limitation of only two processes per processor socket (assuming that -there are more than two cores per socket). - -The LAMA MPPR specifications are set via the rmaps_lama_mppr MCA -parameter. The following resources can be specified: - - Hardware thread h - Processor core c - Processor socket s - NUMA node N - L1 cache L1 - L2 cache L2 - L3 cache L3 - Node (server) n -# -[internal error] -An unexpected internal error occurred in the LAMA mapper; your job -will now fail. Sorry. - - File: %s - Message: %s diff --git a/orte/mca/rmaps/lama/owner.txt b/orte/mca/rmaps/lama/owner.txt deleted file mode 100644 index 0cc0384f0eb..00000000000 --- a/orte/mca/rmaps/lama/owner.txt +++ /dev/null @@ -1,7 +0,0 @@ -# -# owner/status file -# owner: institution that is responsible for this package -# status: e.g. active, maintenance, unmaintained -# -owner: CISCO -status: maintenance diff --git a/orte/mca/rmaps/lama/rmaps_lama.h b/orte/mca/rmaps/lama/rmaps_lama.h deleted file mode 100644 index 8cb830f861e..00000000000 --- a/orte/mca/rmaps/lama/rmaps_lama.h +++ /dev/null @@ -1,177 +0,0 @@ -/* - * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. - * - * Copyright (c) 2013-2017 Cisco Systems, Inc. 
All rights reserved - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** - * @file - * - * Resource Mapping - */ -#ifndef ORTE_RMAPS_LAMA_H -#define ORTE_RMAPS_LAMA_H - -#include "orte_config.h" - -#include "opal/class/opal_tree.h" - -#include "orte/mca/rmaps/rmaps.h" - -BEGIN_C_DECLS - -ORTE_MODULE_DECLSPEC extern orte_rmaps_base_component_t mca_rmaps_lama_component; - -extern orte_rmaps_base_module_t orte_rmaps_lama_module; - - -/********************************* - * Structures & Defines - *********************************/ -/* - * JJH: Can we reuse the opal_hwloc_level_t data structure in - * opal/mca/hwloc/hwloc-internal.h - */ -typedef enum { - LAMA_LEVEL_MACHINE = 0, - LAMA_LEVEL_BOARD = 1, - LAMA_LEVEL_NUMA = 2, - LAMA_LEVEL_SOCKET = 3, - LAMA_LEVEL_CACHE_L3 = 4, - LAMA_LEVEL_CACHE_L2 = 5, - LAMA_LEVEL_CACHE_L1 = 6, - LAMA_LEVEL_CORE = 7, - LAMA_LEVEL_PU = 8, - LAMA_LEVEL_UNKNOWN = 9 -} rmaps_lama_level_type_t; - -typedef enum { - LAMA_ORDER_NATURAL = 0, - LAMA_ORDER_SEQ = 1 -} rmaps_lama_order_type_t; - -struct rmaps_lama_level_info_t { - rmaps_lama_level_type_t type; - int max_resources; -}; -typedef struct rmaps_lama_level_info_t rmaps_lama_level_info_t; - -/* - * Structure to attach to the hwloc tree - * Accounting for mppr - */ -struct rmaps_lama_hwloc_user_t { - opal_object_t super; - - opal_pointer_array_t *node_mppr; -}; -typedef struct rmaps_lama_hwloc_user_t rmaps_lama_hwloc_user_t; -OBJ_CLASS_DECLARATION(rmaps_lama_hwloc_user_t); - -struct rmaps_lama_node_mppr_t { - int max; - int cur; -}; -typedef struct rmaps_lama_node_mppr_t rmaps_lama_node_mppr_t; - -rmaps_lama_level_type_t lama_type_str_to_enum(char *param); -char * lama_type_enum_to_str(rmaps_lama_level_type_t param); - - -/********************************* - * Command Line Interface Parsing - *********************************/ -/* - * User defined command line interface (CLI) arguments - */ -extern char * rmaps_lama_cmd_map; -extern char * rmaps_lama_cmd_bind; 
-extern char * rmaps_lama_cmd_mppr; -extern char * rmaps_lama_cmd_ordering; -extern bool rmaps_lama_timing_enabled; -extern bool rmaps_lama_can_oversubscribe; -extern bool rmaps_lama_am_oversubscribing; - -/* - * Internal representations of command line arguments - */ -extern int lama_mapping_num_layouts; -extern rmaps_lama_level_type_t *lama_mapping_layout; - -extern rmaps_lama_level_type_t lama_binding_level; - -extern rmaps_lama_level_info_t *lama_mppr_levels; -extern int lama_mppr_num_levels; - -/* - * Homogeneous system optimization - */ -extern bool lama_mppr_max_tree_homogeneous_system; - -/* - * Maximum length of digits in CLI - */ -#define MAX_BIND_DIGIT_LEN 4 - -int rmaps_lama_process_alias_params(orte_job_t *jdata); - -int rmaps_lama_parse_mapping(char *layout, - rmaps_lama_level_type_t **layout_types, - rmaps_lama_level_type_t **layout_types_sorted, - int *num_types); -int rmaps_lama_parse_binding(char *layout, - rmaps_lama_level_type_t *binding_level, - int *num_types); -int rmaps_lama_parse_mppr(char *layout, - rmaps_lama_level_info_t **mppr_levels, - int *num_types); -int rmaps_lama_parse_ordering(char *layout, - rmaps_lama_order_type_t *order); - -bool rmaps_lama_ok_to_prune_level(rmaps_lama_level_type_t level); - -/********************************* - * Max Tree Structure - *********************************/ -struct rmaps_lama_max_tree_item_t { - opal_tree_item_t tree_element; - - rmaps_lama_level_type_t type; -}; -typedef struct rmaps_lama_max_tree_item_t rmaps_lama_max_tree_item_t; - - -/* - * Union all topologies into the max tree - */ -int rmaps_lama_build_max_tree(orte_job_t *jdata, opal_list_t *node_list, - opal_tree_t * max_tree, bool *is_homogeneous); - -/* - * Find a matching subtree - */ -hwloc_obj_t * rmaps_lama_find_nth_subtree_match(hwloc_topology_t hwloc_topo, - hwloc_obj_t parent_obj, - int nth, - rmaps_lama_level_type_t lama_key); -hwloc_obj_t * rmaps_lama_find_parent(hwloc_topology_t hwloc_topo, - hwloc_obj_t *child_obj, - 
rmaps_lama_level_type_t lama_key); - -/* - * Create Empty Tree - */ -opal_tree_t * rmaps_lama_create_empty_max_tree(void); - -/* - * Pretty Print - */ -void rmaps_lama_max_tree_pretty_print_tree(opal_tree_t *tree); - -END_C_DECLS - -#endif /* ORTE_RMAPS_LAMA_H */ diff --git a/orte/mca/rmaps/lama/rmaps_lama_component.c b/orte/mca/rmaps/lama/rmaps_lama_component.c deleted file mode 100644 index e8734dbec64..00000000000 --- a/orte/mca/rmaps/lama/rmaps_lama_component.c +++ /dev/null @@ -1,136 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. - * - * Copyright (c) 2012 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2015 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "orte_config.h" -#include "orte/constants.h" - -#include "opal/mca/base/base.h" - -#include "orte/mca/rmaps/base/rmaps_private.h" -#include "orte/mca/rmaps/base/base.h" - -#include "rmaps_lama.h" - -/* - * Local functions - */ - -static int orte_rmaps_lama_register(void); -static int orte_rmaps_lama_query(mca_base_module_t **module, int *priority); - -static int module_priority; - -char * rmaps_lama_cmd_map = NULL; -char * rmaps_lama_cmd_bind = NULL; -char * rmaps_lama_cmd_mppr = NULL; -char * rmaps_lama_cmd_ordering = NULL; -bool rmaps_lama_timing_enabled = false; -bool rmaps_lama_can_oversubscribe = false; -bool rmaps_lama_am_oversubscribing = false; - -orte_rmaps_base_component_t mca_rmaps_lama_component = { - .base_version = { - ORTE_RMAPS_BASE_VERSION_2_0_0, - - .mca_component_name = "lama", - MCA_BASE_MAKE_VERSION(component, ORTE_MAJOR_VERSION, ORTE_MINOR_VERSION, - ORTE_RELEASE_VERSION), - .mca_query_component = orte_rmaps_lama_query, - .mca_register_component_params = orte_rmaps_lama_register, - }, - .base_data = { - /* The component is checkpoint ready */ - MCA_BASE_METADATA_PARAM_CHECKPOINT 
- }, -}; - - -static int orte_rmaps_lama_register(void) -{ - mca_base_component_t *c = &mca_rmaps_lama_component.base_version; - - /* JMS Artifically low for now */ - module_priority = 0; - (void) mca_base_component_var_register (c, "priority", "Priority of the LAMA rmaps component", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &module_priority); - - rmaps_lama_timing_enabled = false; - (void) mca_base_component_var_register (c, "timing", - "Enable timing information. [Default = disabled]", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &rmaps_lama_timing_enabled); - - rmaps_lama_cmd_map = NULL; - (void) mca_base_component_var_register (c, "map", "LAMA Map: Process layout iteration ordering (See documentation)", - MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, - &rmaps_lama_cmd_map); - - rmaps_lama_cmd_bind = NULL; - (void) mca_base_component_var_register (c, "bind", "LAMA Bind: Bind to the specified number of resources (See documentation)", - MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, - &rmaps_lama_cmd_bind); - - rmaps_lama_cmd_mppr = NULL; - (void) mca_base_component_var_register (c, "mppr", "LAMA MPPR: Maximum number of the specified resources available (See documentation)", - MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, - &rmaps_lama_cmd_mppr); - - rmaps_lama_cmd_ordering = NULL; - (void) mca_base_component_var_register (c, "ordering", "LAMA Ordering: Ordering (s) sequential, (n) natural - Default: n (See documentation)", - MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, - &rmaps_lama_cmd_ordering); - - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Priority %3d", - module_priority); - - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: 
Map : %s", - (NULL == rmaps_lama_cmd_map) ? "NULL" : rmaps_lama_cmd_map); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Bind : %s", - (NULL == rmaps_lama_cmd_bind) ? "NULL" : rmaps_lama_cmd_bind); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: MPPR : %s", - (NULL == rmaps_lama_cmd_mppr) ? "NULL" : rmaps_lama_cmd_mppr); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Order : %s", - (NULL == rmaps_lama_cmd_ordering) ? "NULL" : rmaps_lama_cmd_ordering); - - return ORTE_SUCCESS; -} - - -static int orte_rmaps_lama_query(mca_base_module_t **module, int *priority) -{ - /* Only run on the HNP */ - - *priority = module_priority; - *module = (mca_base_module_t *)&orte_rmaps_lama_module; - - return ORTE_SUCCESS; -} diff --git a/orte/mca/rmaps/lama/rmaps_lama_max_tree.c b/orte/mca/rmaps/lama/rmaps_lama_max_tree.c deleted file mode 100644 index a1183028b3b..00000000000 --- a/orte/mca/rmaps/lama/rmaps_lama_max_tree.c +++ /dev/null @@ -1,1182 +0,0 @@ -/* - * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2012 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2015 Research Organization for Information Science - * and Technology (RIST). All rights reserved. 
- * - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** - * Max Tree Support Functions - * - */ -#include "rmaps_lama.h" - -#include "orte/util/show_help.h" - -#include "orte/mca/errmgr/errmgr.h" - -#include "orte/mca/rmaps/base/rmaps_private.h" -#include "orte/mca/rmaps/base/base.h" - -/********************************* - * Max Tree Construction - *********************************/ -/* - * Convert an hwloc tree to an opal_tree - */ -static int rmaps_lama_convert_hwloc_tree_to_opal_tree(opal_tree_t *opal_tree, - hwloc_topology_t *hwloc_topo); - -/* - * Convert an hwloc subtree to an opal subtree - */ -static int rmaps_lama_convert_hwloc_subtree(hwloc_obj_t obj, - opal_tree_item_t *parent_item); - -/* - * Convert LAMA key to HWLOC key/depth - */ -static int rmaps_lama_convert_lama_key_to_hwloc_key(rmaps_lama_level_type_t lama_key, - hwloc_obj_type_t *hwloc_key, int *depth); - -/* - * Convert HWLOC key/depth to LAMA key - */ -static int rmaps_lama_convert_hwloc_key_to_lama_key(hwloc_obj_type_t hwloc_key, int depth, - rmaps_lama_level_type_t *lama_key); - -/* - * Compare two HWLOC topologies for similar structure - */ -static int rmaps_lama_hwloc_compare_topos(hwloc_topology_t *left, hwloc_topology_t *right); -static int rmaps_lama_hwloc_compare_subtrees(hwloc_obj_t left, hwloc_obj_t right); - -/* - * Merge two opal_trees - */ -static int rmaps_lama_merge_trees(opal_tree_t *src_tree, opal_tree_t *into_tree, - opal_tree_item_t *src_parent, opal_tree_item_t *into_parent); - -/* - * Prune the max tree to just those levels specified - */ -static int rmaps_lama_prune_max_tree(opal_tree_t *max_tree, opal_tree_item_t *parent_item); - -/* - * Annotate the hwloc tree for MPPR accounting - */ -static int rmaps_lama_annotate_node_for_mppr(orte_node_t *node, hwloc_obj_t obj); - -/* - * Access the MPPR for the specified key - */ -static int rmaps_lama_get_mppr_for_key(orte_node_t *node, rmaps_lama_level_type_t lama_key); - -/* - * Recursive core of 
nth_subtree_match - */ -static int rmaps_lama_find_nth_subtree_match_core(hwloc_topology_t hwloc_topo, - hwloc_obj_t parent_obj, - int nth, - int *num_found, - hwloc_obj_type_t hwloc_key, - int depth, - hwloc_obj_t *cur_child); - -static void rmaps_lama_max_tree_item_construct(rmaps_lama_max_tree_item_t *item) -{ - item->type = LAMA_LEVEL_UNKNOWN; -} - - -/********************************* - * Max Tree Accessors/Functions - *********************************/ -OBJ_CLASS_INSTANCE(rmaps_lama_max_tree_item_t, - opal_tree_item_t, - rmaps_lama_max_tree_item_construct, NULL); - -static int lama_max_tree_comp(opal_tree_item_t *item, void *key); -static int lama_max_tree_serialize(opal_tree_item_t *item, opal_buffer_t *buffer); -static int lama_max_tree_deserialize(opal_buffer_t *buffer, opal_tree_item_t **item); -static void * lama_max_tree_get_key(opal_tree_item_t *item); - - -/********************************* - * Max Tree Pretty Print - *********************************/ -static char * rmaps_lama_max_tree_pretty_print_subtree_element_get(opal_tree_t *tree, - opal_tree_item_t *parent, - int level); -static void pretty_print_subtree(opal_tree_t *tree, opal_tree_item_t *parent, int level); -static void pretty_print_subtree_element(opal_tree_t *tree, opal_tree_item_t *parent, int level); - - -/********************************* - * Function Defintions - *********************************/ -int rmaps_lama_build_max_tree(orte_job_t *jdata, opal_list_t *node_list, - opal_tree_t * max_tree, bool *is_homogeneous) -{ - int ret; - opal_tree_t *tmp_tree = NULL; - hwloc_topology_t topo, *last_topo = NULL; - orte_node_t *cur_node = NULL; - - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ---------------------------------"); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ----- Building the Max Tree..."); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: 
---------------------------------"); - - /* - * Assume homogeneous system, unless otherwise noted - */ - *is_homogeneous = true; - - /* - * Process all other unique trees from remote daemons who are in - * this allocation - */ - for(cur_node = (orte_node_t*)opal_list_get_first(node_list); - cur_node != (orte_node_t*)opal_list_get_end(node_list); - cur_node = (orte_node_t*)opal_list_get_next(cur_node) ) { - if (NULL == (topo = cur_node->topology)) { - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ----- No Tree Available: %s (skipping)", cur_node->name); - } - - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ----- Converting Remote Tree: %s", cur_node->name); - - /* - * Convert to opal_tree - */ - tmp_tree = rmaps_lama_create_empty_max_tree(); - rmaps_lama_convert_hwloc_tree_to_opal_tree(tmp_tree, &topo); - if( 11 <= opal_output_get_verbosity(orte_rmaps_base_framework.framework_output) ) { - rmaps_lama_max_tree_pretty_print_tree(tmp_tree); - } - - /* - * Compare the current and last topologies if we are still considering - * this max tree to represent a homogeneous system. - */ - if( *is_homogeneous ) { - if( NULL == last_topo ) { - last_topo = &topo; - } else { - if( 0 != rmaps_lama_hwloc_compare_topos(last_topo, &topo) ) { - *is_homogeneous = false; - } - } - } - - /* - * Prune the input tree so that is only contains levels that the user - * asked for. 
- */ - if( 11 <= opal_output_get_verbosity(orte_rmaps_base_framework.framework_output) ) { - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ---------------------------------"); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ----- Pruning input Tree..."); - } - if( ORTE_SUCCESS != (ret = rmaps_lama_prune_max_tree(tmp_tree, opal_tree_get_root(tmp_tree))) ) { - return ret; - } - if( 11 <= opal_output_get_verbosity(orte_rmaps_base_framework.framework_output) ) { - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ----- Input Tree... - Post Prune"); - rmaps_lama_max_tree_pretty_print_tree(tmp_tree); - } - - /* - * Merge into max_tree - */ - if( opal_tree_is_empty(max_tree) ) { - opal_tree_dup(tmp_tree, max_tree); - } else { - if( ORTE_SUCCESS != (ret = rmaps_lama_merge_trees(tmp_tree, - max_tree, - opal_tree_get_root(tmp_tree), - opal_tree_get_root(max_tree) ))) { - return ret; - } - } - - /* - * Release and move on... - */ - OBJ_RELEASE(tmp_tree); - tmp_tree = NULL; - } - - - /* - * Fill out the MPPR accounting information for each node - */ - for(cur_node = (orte_node_t*)opal_list_get_first(node_list); - cur_node != (orte_node_t*)opal_list_get_end(node_list); - cur_node = (orte_node_t*)opal_list_get_next(cur_node) ) { - if( ORTE_SUCCESS != (ret = rmaps_lama_annotate_node_for_mppr(cur_node, - hwloc_get_obj_by_depth(cur_node->topology, 0, 0))) ) { - ORTE_ERROR_LOG(ret); - return ret; - } - } - - /* - * JJH: NEEDS TESTING - * Note: This check is in place, but not used at the moment due to lack of - * system availability. Pending system availability and further testing, - * just assume heterogeneous. - */ - *is_homogeneous = false; - - /* - * Display the final Max Tree - */ - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ----- Final Max Tree... - %s system", - (*is_homogeneous ? 
"Homogeneous" : "Heterogeneous") ); - if( 11 <= opal_output_get_verbosity(orte_rmaps_base_framework.framework_output) ) { - rmaps_lama_max_tree_pretty_print_tree(max_tree); - } - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ---------------------------------"); - - return ORTE_SUCCESS; -} - -static int rmaps_lama_convert_hwloc_tree_to_opal_tree(opal_tree_t *opal_tree, hwloc_topology_t *hwloc_topo) -{ - hwloc_obj_t topo_root; - - if( 15 <= opal_output_get_verbosity(orte_rmaps_base_framework.framework_output) ) { - opal_output_verbose(15, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ----- Converting Topology:"); - /* opal_dss.dump(0, opal_hwloc_topology, OPAL_HWLOC_TOPO); */ - opal_dss.dump(0, *hwloc_topo, OPAL_HWLOC_TOPO); - } - - topo_root = hwloc_get_root_obj(*hwloc_topo); - - rmaps_lama_convert_hwloc_subtree(topo_root, - opal_tree_get_root(opal_tree)); - - return ORTE_SUCCESS; -} - -static int rmaps_lama_convert_hwloc_subtree(hwloc_obj_t obj, - opal_tree_item_t *parent_item) -{ - rmaps_lama_max_tree_item_t *max_tree_item = NULL; - char * key_child_str = NULL; - char * key_parent_str = NULL; - - while (obj) { - /* - * Create new tree item - */ - max_tree_item = OBJ_NEW(rmaps_lama_max_tree_item_t); - - /* - * Convert the HWLOC object to the LAMA key - */ - rmaps_lama_convert_hwloc_key_to_lama_key(obj->type, - obj->attr->cache.depth, - &(max_tree_item->type)); - - /* - * Append tree item to parent. Unless it is the same as the - * parent (L1 instruction vs data cache). JJH: Newer versions - * of hwloc can differentiate from the obj->attr->cache.type. 
- */ - if( NULL != obj->parent && - obj->parent->type == obj->type && - obj->parent->attr->cache.depth == obj->attr->cache.depth ) { - key_child_str = lama_type_enum_to_str(max_tree_item->type); - key_parent_str = lama_type_enum_to_str(((rmaps_lama_max_tree_item_t*)parent_item)->type); - opal_output_verbose(10, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Warning: Identical level detected: " - "Child [%s] vs Parent [%s]", - key_child_str, key_parent_str); - free(key_child_str); - free(key_parent_str); - - /* - * Add descendants if they exist - */ - if (obj->first_child) { - rmaps_lama_convert_hwloc_subtree(obj->first_child, - parent_item); - } - } else { - opal_tree_add_child(parent_item, &max_tree_item->tree_element); - - /* - * Add descendants if they exist - */ - if (obj->first_child) { - rmaps_lama_convert_hwloc_subtree(obj->first_child, - &max_tree_item->tree_element); - } - } - - /* - * Advance to next sibling - */ - obj = obj->next_sibling; - } - - return ORTE_SUCCESS; -} - -static int rmaps_lama_annotate_node_for_mppr(orte_node_t *node, hwloc_obj_t obj) -{ - rmaps_lama_hwloc_user_t *hwloc_userdata = NULL; - rmaps_lama_node_mppr_t *mppr_accounting = NULL; - rmaps_lama_level_type_t lama_key; - opal_hwloc_topo_data_t *opal_hwloc_topo = NULL; - int i; - - /* - * Attach our user pointer to the topology, if it is not already there. - * We will fill it in as needed later. - * - * Note: opal/mca/hwloc/base/hwloc_base_util.c attaches their own object - * to the userdata. There is a pointer in that structure we can use without - * interfering with what OPAL is trying to do. - */ - if( NULL == obj->userdata ) { - /* Some objects may not have topo data associated with them - * JJH: This is memory leak :/ Fix. 
- */ - obj->userdata = (void*)OBJ_NEW(opal_hwloc_topo_data_t); - } - if( NULL != obj->userdata ) { - opal_hwloc_topo = (opal_hwloc_topo_data_t*)(obj->userdata); - - if( NULL == opal_hwloc_topo->userdata ) { - hwloc_userdata = OBJ_NEW(rmaps_lama_hwloc_user_t); - opal_hwloc_topo->userdata = hwloc_userdata; - } else { - hwloc_userdata = (rmaps_lama_hwloc_user_t*)(opal_hwloc_topo->userdata); - } - } - - - /* - * Add node information if it is not already there - */ - mppr_accounting = (rmaps_lama_node_mppr_t*)opal_pointer_array_get_item(hwloc_userdata->node_mppr, node->index); - if( NULL == mppr_accounting ) { - /* - * Add MPPR accounting for this node associated with this object - */ - rmaps_lama_convert_hwloc_key_to_lama_key(obj->type, obj->attr->cache.depth, &lama_key); - - mppr_accounting = (rmaps_lama_node_mppr_t*)malloc(sizeof(rmaps_lama_node_mppr_t)); - mppr_accounting->max = rmaps_lama_get_mppr_for_key(node, lama_key); - mppr_accounting->cur = 0; - - opal_pointer_array_set_item(hwloc_userdata->node_mppr, node->index, mppr_accounting); - } - - - /* - * Decend tree - */ - for(i = 0; i < (int)obj->arity; ++i ) { - rmaps_lama_annotate_node_for_mppr(node, - obj->children[i]); - } - - return ORTE_SUCCESS; -} - -static int rmaps_lama_get_mppr_for_key(orte_node_t *node, rmaps_lama_level_type_t lama_key) -{ - int i; - - for( i = 0; i < lama_mppr_num_levels; ++i ) { - if( lama_key == lama_mppr_levels[i].type ) { - return lama_mppr_levels[i].max_resources; - } - } - - return -1; -} - -static int rmaps_lama_convert_lama_key_to_hwloc_key(rmaps_lama_level_type_t lama_key, hwloc_obj_type_t *hwloc_key, int *depth) -{ - *depth = 0; - - switch(lama_key) { - case LAMA_LEVEL_MACHINE: - *hwloc_key = HWLOC_OBJ_MACHINE; - break; - /* Note: HWLOC does not support boards */ -#if 0 - case LAMA_LEVEL_BOARD: - *hwloc_key = HWLOC_OBJ_MACHINE; - break; -#endif - case LAMA_LEVEL_SOCKET: - *hwloc_key = HWLOC_OBJ_SOCKET; - break; - case LAMA_LEVEL_CORE: - *hwloc_key = HWLOC_OBJ_CORE; - break; - 
case LAMA_LEVEL_PU: - *hwloc_key = HWLOC_OBJ_PU; - break; - case LAMA_LEVEL_CACHE_L1: - *hwloc_key = HWLOC_OBJ_CACHE; - *depth = 1; - break; - case LAMA_LEVEL_CACHE_L2: - *hwloc_key = HWLOC_OBJ_CACHE; - *depth = 2; - break; - case LAMA_LEVEL_CACHE_L3: - *hwloc_key = HWLOC_OBJ_CACHE; - *depth = 3; - break; - case LAMA_LEVEL_NUMA: - *hwloc_key = HWLOC_OBJ_NODE; - break; - default: - *hwloc_key = HWLOC_OBJ_TYPE_MAX; - break; - } - - return ORTE_SUCCESS; -} - -static int rmaps_lama_convert_hwloc_key_to_lama_key(hwloc_obj_type_t hwloc_key, int depth, rmaps_lama_level_type_t *lama_key) -{ - switch(hwloc_key) { - case HWLOC_OBJ_MACHINE: - *lama_key = LAMA_LEVEL_MACHINE; - break; - /* Node: HWLOC does not support boards */ -#if 0 - case HWLOC_OBJ_BOARD: - *lama_key = LAMA_LEVEL_BOARD; - break; -#endif - case HWLOC_OBJ_SOCKET: - *lama_key = LAMA_LEVEL_SOCKET; - break; - case HWLOC_OBJ_CORE: - *lama_key = LAMA_LEVEL_CORE; - break; - case HWLOC_OBJ_PU: - *lama_key = LAMA_LEVEL_PU; - break; - case HWLOC_OBJ_CACHE: - if( 1 == depth ) { - *lama_key = LAMA_LEVEL_CACHE_L1; - } - else if( 2 == depth ) { - *lama_key = LAMA_LEVEL_CACHE_L2; - } - else if( 3 == depth ) { - *lama_key = LAMA_LEVEL_CACHE_L3; - } - else { - *lama_key = LAMA_LEVEL_UNKNOWN; - } - break; - case HWLOC_OBJ_NODE: - *lama_key = LAMA_LEVEL_NUMA; - break; - default: - *lama_key = LAMA_LEVEL_UNKNOWN; - break; - } - - return ORTE_SUCCESS; -} - -static int rmaps_lama_hwloc_compare_topos(hwloc_topology_t *left, hwloc_topology_t *right) -{ - hwloc_obj_t left_root; - hwloc_obj_t right_root; - - /* - * Note: I hope that there is a 'better' way of doing this natively with - * HWLOC, but it is not obvious if they have the ability to compare - * topologies. So do a depth first comparison of the trees. 
- * You may be able to use the below: - * OPAL_EQUAL != opal_dss.compare(*last_topo, topo, OPAL_HWLOC_TOPO); - */ - - left_root = hwloc_get_obj_by_depth(*left, 0, 0); - right_root = hwloc_get_obj_by_depth(*right, 0, 0); - - return rmaps_lama_hwloc_compare_subtrees(left_root, right_root); -} - -static int rmaps_lama_hwloc_compare_subtrees(hwloc_obj_t left, hwloc_obj_t right) -{ - int i, ret; - - /* - * Check Types - */ - if( 0 != (ret = hwloc_compare_types(left->type, right->type)) ) { - return ret; - } - - /* - * Check 'arity' at this level - */ - if( left->arity > right->arity ) { - return -1; - } - else if( left->arity < right->arity ) { - return 1; - } - - /* - * Check all subtrees - */ - for(i = 0; i < (int)left->arity; ++i ) { - if( 0 != (ret = rmaps_lama_hwloc_compare_subtrees(left->children[i], - right->children[i])) ) { - return ret; - } - } - - /* - * Subtree is the same if we get here - */ - return 0; -} - -static int rmaps_lama_merge_trees(opal_tree_t *src_tree, opal_tree_t *max_tree, - opal_tree_item_t *src_parent, opal_tree_item_t *max_parent) -{ - int ret, exit_status = ORTE_SUCCESS; - rmaps_lama_level_type_t *key_src, *key_max; - opal_tree_item_t *child_item = NULL, *max_grandparent = NULL; - opal_tree_item_t *max_child_item = NULL; - int num_max, num_src; - int i; - char *key_src_str = NULL; - char *key_max_str = NULL; -#if 1 - char *str = NULL; -#endif - - /* - * Basecase - */ - if( NULL == src_parent ) { - return ORTE_SUCCESS; - } - - key_src = (rmaps_lama_level_type_t*)src_tree->get_key(src_parent); - key_max = (rmaps_lama_level_type_t*)max_tree->get_key(max_parent); - - key_src_str = lama_type_enum_to_str(*key_src); - key_max_str = lama_type_enum_to_str(*key_max); - - if( 15 <= opal_output_get_verbosity(orte_rmaps_base_framework.framework_output) ) { - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: CHECK: Merge Trees: Keys Src (%2d - %s) vs Max (%2d - %s)", - *key_src, key_src_str, *key_max, key_max_str); - 
} - - /* - * Make sure keys at this level match. - * - * JJH: Give up if they do not match. - * JJH: We should pick a victim and prune from the tree - * JJH: preferably from the 'native' tree. - */ - if( 0 != max_tree->comp(max_parent, src_tree->get_key(src_parent)) ) { - /* - * If the source conflicts due to cache, iterate to children to find a match. - * JJH: Double check this for different heterogenous systems - */ - if( LAMA_LEVEL_CACHE_L3 == *key_src || - LAMA_LEVEL_CACHE_L2 == *key_src || - LAMA_LEVEL_CACHE_L1 == *key_src || - LAMA_LEVEL_NUMA == *key_src ) { - opal_output_verbose(10, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Warning: Merge Trees: " - "Src with Conflicting Memory Hierarchy [Src (%2d - %s) vs Max (%2d - %s)]", - *key_src, key_src_str, *key_max, key_max_str); - - /* - * If we are pruning a cache level, then check to make sure it is - * not important to the process layout. - */ - if( !rmaps_lama_ok_to_prune_level(*key_src) ) { - orte_show_help("help-orte-rmaps-lama.txt", - "orte-rmaps-lama:merge-conflict-bad-prune-src", - true, - key_src_str, - (NULL == rmaps_lama_cmd_map ? "[Not Provided]" : rmaps_lama_cmd_map), - (NULL == rmaps_lama_cmd_bind ? "[Not Provided]" : rmaps_lama_cmd_bind), - (NULL == rmaps_lama_cmd_mppr ? "[Not Provided]" : rmaps_lama_cmd_mppr), - (NULL == rmaps_lama_cmd_ordering ? 
"[Not Provided]" : rmaps_lama_cmd_ordering)); - exit_status = ORTE_ERROR; - goto cleanup; - } - - /* - * If the number of children at this pruned level was larger than - * the max tree arity at this level, then duplicate the max_tree - * element the approprate number of times - */ - max_grandparent = opal_tree_get_parent(max_parent); - num_max = opal_tree_num_children(max_grandparent); - num_src = opal_tree_num_children(src_parent); - - for(i = 0; i < (num_src - num_max); ++i ) { -#if 1 - str = rmaps_lama_max_tree_pretty_print_subtree_element_get(max_tree, max_parent, 0); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Merge: Appending child %s - post prune", - str); - free(str); -#endif - /* Duplicate max child subtree */ - opal_tree_copy_subtree(max_tree, max_parent, max_tree, max_grandparent); - } - - /* - * Iterate to children, until we find a match - */ - for(child_item = opal_tree_get_first_child(src_parent); - child_item != NULL; - child_item = opal_tree_get_next_sibling(child_item) ) { - - if( ORTE_SUCCESS != (ret = rmaps_lama_merge_trees(src_tree, - max_tree, - child_item, - max_parent)) ) { - exit_status = ret; - goto cleanup; - } - } - - exit_status = ORTE_SUCCESS; - goto cleanup; - } - /* - * If the max tree conflicts due to cache, then we need to prune the - * max tree until it matches. - * JJH: If we are pruning a level of the hierarchy then make sure we - * JJH: don't need it for the process layout. - */ - else if( LAMA_LEVEL_CACHE_L3 == *key_max || - LAMA_LEVEL_CACHE_L2 == *key_max || - LAMA_LEVEL_CACHE_L1 == *key_max || - LAMA_LEVEL_NUMA == *key_max ) { - opal_output_verbose(10, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Warning: Merge Trees: " - "Max with Conflicting Memory Hierarchy [Src (%2d - %s) vs Max (%2d - %s)]", - *key_src, key_src_str, *key_max, key_max_str); - - /* - * If we are pruning a cache level, then check to make sure it is - * not important to the process layout. 
- */ - if( !rmaps_lama_ok_to_prune_level(*key_max) ) { - orte_show_help("help-orte-rmaps-lama.txt", - "orte-rmaps-lama:merge-conflict-bad-prune-src", - true, - key_max_str, - (NULL == rmaps_lama_cmd_map ? "[Not Provided]" : rmaps_lama_cmd_map), - (NULL == rmaps_lama_cmd_bind ? "[Not Provided]" : rmaps_lama_cmd_bind), - (NULL == rmaps_lama_cmd_mppr ? "[Not Provided]" : rmaps_lama_cmd_mppr), - (NULL == rmaps_lama_cmd_ordering ? "[Not Provided]" : rmaps_lama_cmd_ordering)); - exit_status = ORTE_ERROR; - goto cleanup; - } - - max_child_item = opal_tree_get_first_child(max_parent); - /* Prune parent */ - opal_tree_remove_item(max_tree, max_parent); - - /* Try again with child */ - exit_status = rmaps_lama_merge_trees(src_tree, - max_tree, - src_parent, - max_child_item); - goto cleanup; - } - - /* - * If we cannot resolve it, give up. - */ - opal_output(0, "mca:rmaps:lama: Error: Merge Trees: " - "Different Keys Src (%2d - %s) vs Max (%2d - %s) - Do not know how to resolve - give up!", - *key_src, key_src_str, *key_max, key_max_str); - - exit_status = ORTE_ERROR; - goto cleanup; - } - - num_max = opal_tree_num_children(max_parent); - num_src = opal_tree_num_children(src_parent); - - /* - * If the 'native' tree has more children than the 'max' tree. - * Add the missing children to the 'max' tree. - */ - if( num_max < num_src ) { - i = 0; - for(child_item = opal_tree_get_first_child(src_parent); - child_item != NULL; - child_item = opal_tree_get_next_sibling(child_item)) { - if(i >= num_max ) { -#if 1 - str = rmaps_lama_max_tree_pretty_print_subtree_element_get(src_tree, child_item, 0); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Merge: Appending child %s", - str); - free(str); -#endif - /* Add child's subtree to max */ - opal_tree_copy_subtree(src_tree, child_item, max_tree, max_parent); - } - ++i; - } - } - - /* - * Recursively search all children of 'native' tree. 
- * - * Note: Only need to add the children to the 'left-most' branch of the - * 'max' tree since that is the only branch that is searched during mapping. - * But do the whole thing for good measure. - */ - for( child_item = opal_tree_get_first_child(src_parent), - max_child_item = opal_tree_get_first_child(max_parent); - child_item != NULL; - child_item = opal_tree_get_next_sibling(child_item), - max_child_item = opal_tree_get_next_sibling(max_child_item) ) { - - if( ORTE_SUCCESS != (ret = rmaps_lama_merge_trees(src_tree, - max_tree, - child_item, - max_child_item)) ) { - exit_status = ret; - goto cleanup; - } - } - - cleanup: - if( NULL != key_src_str ) { - free(key_src_str); - key_src_str = NULL; - } - - if( NULL != key_max_str ) { - free(key_max_str); - key_max_str = NULL; - } - - return exit_status; -} - -static int rmaps_lama_prune_max_tree(opal_tree_t *max_tree, opal_tree_item_t *parent_item) -{ - int ret; - opal_tree_item_t *child_item = NULL, *next_item; - int i; - bool found; - rmaps_lama_level_type_t *key_max; - char *tmp_str = NULL; - - /* - * Basecase - */ - if( NULL == parent_item ) { - return ORTE_SUCCESS; - } - - /* - * Recursively decend tree - Depth first - * Basecase: No children, loop skipped - */ - child_item = opal_tree_get_first_child(parent_item); - while( child_item != NULL ) { - /* Do this before the recursive call, since it might remove this - * child so we need to preserve a pointer to the next sibling. 
- */ - next_item = opal_tree_get_next_sibling(child_item); - - if( ORTE_SUCCESS != (ret = rmaps_lama_prune_max_tree(max_tree, - child_item)) ) { - return ret; - } - - child_item = next_item; - } - - key_max = (rmaps_lama_level_type_t*)max_tree->get_key(parent_item); - - /* - * Check keys against the user supplied layout - */ - found = false; - for(i = 0; i < lama_mapping_num_layouts; ++i ) { - if( 0 == max_tree->comp(parent_item, &lama_mapping_layout[i]) ) { - found = true; - break; - } - } - - if( !found ) { - if( 15 <= opal_output_get_verbosity(orte_rmaps_base_framework.framework_output) ) { - tmp_str = lama_type_enum_to_str(*key_max); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ----- Before pruning %s", - tmp_str); - free(tmp_str); - rmaps_lama_max_tree_pretty_print_tree(max_tree); - } - - opal_tree_remove_item(max_tree, parent_item); - - return ORTE_SUCCESS; - } - - return ORTE_SUCCESS; -} - - -hwloc_obj_t * rmaps_lama_find_nth_subtree_match(hwloc_topology_t hwloc_topo, - hwloc_obj_t parent_obj, - int nth, - rmaps_lama_level_type_t lama_key) -{ - hwloc_obj_t *cur_child = NULL; - hwloc_obj_type_t hwloc_key; - int depth; - int num_found; -#if 0 - char str[128]; -#endif - - cur_child = (hwloc_obj_t*)malloc(sizeof(hwloc_obj_t) * 1); - - /* - * Convert LAMA key to HWLOC key - */ - rmaps_lama_convert_lama_key_to_hwloc_key(lama_key, &hwloc_key, &depth); - - /* - * Decend tree looking for the n'th matching subtree - */ - num_found = -1; - rmaps_lama_find_nth_subtree_match_core(hwloc_topo, - parent_obj, - nth, - &num_found, - hwloc_key, - depth, - cur_child); - - /* - * Check to see if we found it - */ -#if 0 - hwloc_obj_snprintf(str, sizeof(str), hwloc_topo, *cur_child, "#", 0); - if( nth == num_found ) { - printf("--> FOUND : %-20s \t -- \t %2d of %2d\n", str, nth, num_found); - } - else { - printf("--> MISSING : %-20s \t -- \t %2d of %2d\n", str, nth, num_found); - } -#endif - - if( nth == num_found ) { - return cur_child; 
- } - else { - free(cur_child); - return NULL; - } -} - -static int rmaps_lama_find_nth_subtree_match_core(hwloc_topology_t hwloc_topo, - hwloc_obj_t parent_obj, - int nth, - int *num_found, - hwloc_obj_type_t hwloc_key, - int depth, - hwloc_obj_t *cur_child) -{ - unsigned i; - bool found = false; - -#if 0 - { - char str[128]; - hwloc_obj_snprintf(str, sizeof(str), hwloc_topo, parent_obj, "#", 0); - printf("--> Checking -- %-20s \t -- \t %2d of %2d\n", str, nth, *num_found); - } -#endif - - /* - * Check if the keys match - */ - if( hwloc_key == parent_obj->type ) { - if( HWLOC_OBJ_CACHE == parent_obj->type && - depth == (int)parent_obj->attr->cache.depth ) { - *num_found += 1; - found = true; - } else { - *num_found += 1; - found = true; - } - } - - /* - * Basecase: - * If we have found the correct item, return - */ - if( nth == *num_found ) { - *cur_child = parent_obj; - return ORTE_SUCCESS; - } - - /* - * Do no go any deeper in the tree than we have to - */ - if( !found ) { - for(i = 0; i < parent_obj->arity; ++i ) { - rmaps_lama_find_nth_subtree_match_core(hwloc_topo, - parent_obj->children[i], - nth, - num_found, - hwloc_key, - depth, - cur_child); - if( nth == *num_found ) { - return ORTE_SUCCESS; - } - } - } - - return ORTE_SUCCESS; -} - -hwloc_obj_t * rmaps_lama_find_parent(hwloc_topology_t hwloc_topo, - hwloc_obj_t *child_obj, - rmaps_lama_level_type_t lama_key) -{ - hwloc_obj_t *cur_parent = NULL; - hwloc_obj_type_t hwloc_key; - int depth; - - /* - * Convert LAMA key to HWLOC key - */ - rmaps_lama_convert_lama_key_to_hwloc_key(lama_key, &hwloc_key, &depth); - - /* - * Sanity check - */ - if( hwloc_key == (*child_obj)->type ) { - if( HWLOC_OBJ_CACHE == (*child_obj)->type && - depth == (int)(*child_obj)->attr->cache.depth ) { - return child_obj; - } else { - return child_obj; - } - } - - cur_parent = (hwloc_obj_t*)malloc(sizeof(hwloc_obj_t) * 1); - if (NULL == cur_parent) { - return NULL; - } - - /* - * Accend tree to find mathing parent - */ - *cur_parent = 
(*child_obj)->parent; - while(NULL != *cur_parent ) { - if( hwloc_key == (*cur_parent)->type ) { - if( HWLOC_OBJ_CACHE == (*cur_parent)->type && - depth == (int)(*cur_parent)->attr->cache.depth ) { - return cur_parent; - } else { - return cur_parent; - } - } - - *cur_parent = (*cur_parent)->parent; - } - - free(cur_parent); - return NULL; -} - - -/********************************* - * Max Tree Structure Functions - *********************************/ -opal_tree_t * rmaps_lama_create_empty_max_tree(void) -{ - opal_tree_t *tmp_tree = NULL; - - tmp_tree = OBJ_NEW(opal_tree_t); - opal_tree_init(tmp_tree, - &lama_max_tree_comp, - &lama_max_tree_serialize, - &lama_max_tree_deserialize, - &lama_max_tree_get_key); - - return tmp_tree; -} - -static int lama_max_tree_comp(opal_tree_item_t *item, void *key) -{ - if( ((rmaps_lama_max_tree_item_t *)item)->type == *((rmaps_lama_level_type_t *)key) ) { - return 0; - } - - return -1; -} - -static int lama_max_tree_serialize(opal_tree_item_t *item, opal_buffer_t *buffer) -{ - opal_dss.pack(buffer, &(((rmaps_lama_max_tree_item_t *)item)->type), 1, OPAL_INT); - - return ORTE_SUCCESS; -} - -static int lama_max_tree_deserialize(opal_buffer_t *buffer, opal_tree_item_t **item) -{ - rmaps_lama_max_tree_item_t *element; - orte_std_cntr_t n = 1; - - element = OBJ_NEW(rmaps_lama_max_tree_item_t); - if( OPAL_SUCCESS == opal_dss.unpack(buffer, &(element->type), &n, OPAL_INT) ) { - *item = (opal_tree_item_t*)element; - } else { - *item = NULL; - } - - return ORTE_SUCCESS; -} - -static void * lama_max_tree_get_key(opal_tree_item_t *item) -{ - return &(((rmaps_lama_max_tree_item_t *)item)->type); -} - - -/********************************* - * Pretty Print Functions - *********************************/ -void rmaps_lama_max_tree_pretty_print_tree(opal_tree_t *tree) -{ - if( NULL == tree ) { - return; - } - - if( opal_tree_is_empty(tree) ) { - return; - } - - pretty_print_subtree(tree, opal_tree_get_root(tree), 0); - - return; -} - -static char * 
rmaps_lama_max_tree_pretty_print_subtree_element_get(opal_tree_t *tree, - opal_tree_item_t *parent, - int level) -{ - char *element_str = NULL; - char *spacer = NULL; - char *label = NULL; - rmaps_lama_level_type_t *type = NULL; - int i; - - if( NULL == parent ) { - return NULL; - } - - spacer = (char *)malloc(sizeof(char) * (level+1)); - for(i = 0; i < level; ++i ) { - spacer[i] = ' '; - } - spacer[level] = '\0'; - - type = (rmaps_lama_level_type_t *)(tree->get_key(parent)); - label = lama_type_enum_to_str(*type); - - asprintf(&element_str, "%s[%s \t : %3d, %3d", - spacer, label, - parent->opal_tree_num_children, parent->opal_tree_num_ancestors); - - free(spacer); - free(label); - - return element_str; -} - -static void pretty_print_subtree(opal_tree_t *tree, opal_tree_item_t *parent, int level) -{ - opal_tree_item_t *child = NULL; - - if( NULL == parent ) { - return; - } - - /* - * Display Self - */ - pretty_print_subtree_element(tree, parent, level); - - /* - * Depth-first display children - * Basecase; If no children - return - */ - level++; - for(child = opal_tree_get_first_child(parent); - child != NULL; - child = opal_tree_get_next_sibling(child) ) { - pretty_print_subtree(tree, child, level); - } - - return; - -} - -static void pretty_print_subtree_element(opal_tree_t *tree, opal_tree_item_t *parent, int level) -{ - char *element_str = NULL; - - if( NULL == parent ) { - return; - } - - element_str = rmaps_lama_max_tree_pretty_print_subtree_element_get(tree, parent, level); - - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Tree Element: %s", - element_str); - - free(element_str); - - return; -} diff --git a/orte/mca/rmaps/lama/rmaps_lama_module.c b/orte/mca/rmaps/lama/rmaps_lama_module.c deleted file mode 100644 index ceb97bf25b1..00000000000 --- a/orte/mca/rmaps/lama/rmaps_lama_module.c +++ /dev/null @@ -1,1914 +0,0 @@ -/* - * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. 
- * - * Copyright (c) 2012-2017 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2014 Intel, Inc. All rights reserved - * Copyright (c) 2015 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "orte_config.h" -#include "orte/constants.h" -#include "orte/types.h" - -#include -#ifdef HAVE_UNISTD_H -#include -#endif /* HAVE_UNISTD_H */ -#include - -#include "opal/mca/hwloc/hwloc-internal.h" - -#include "opal/util/argv.h" -#include "opal/class/opal_tree.h" - -#include "orte/util/show_help.h" -#include "orte/util/error_strings.h" - -#include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/rmaps/base/rmaps_private.h" -#include "orte/mca/rmaps/base/base.h" - -#include "orte/runtime/orte_globals.h" - -#include "rmaps_lama.h" - -#include MCA_timer_IMPLEMENTATION_HEADER - - -/********************************* - * Module setup - *********************************/ -static int orte_rmaps_lama_map(orte_job_t *jdata); -orte_rmaps_base_module_t orte_rmaps_lama_module = { - orte_rmaps_lama_map -}; - - -/********************************* - * Timer - *********************************/ -#define RMAPS_LAMA_TIMER_TOTAL 0 -#define RMAPS_LAMA_TIMER_PARSE_PARAMS 1 -#define RMAPS_LAMA_TIMER_BUILD_MAX_TREE 2 -#define RMAPS_LAMA_TIMER_MAPPING 3 -#define RMAPS_LAMA_TIMER_ORDERING 4 -#define RMAPS_LAMA_TIMER_MAX 5 - -static double rmaps_lama_get_time(void); -static void rmaps_lama_set_time(int idx, bool is_start); -static void rmaps_lama_display_all_timers(void); -static void rmaps_lama_clear_timers(void); -static void rmaps_lama_display_indv_timer_core(double diff, char *str); - -static double timer_start[RMAPS_LAMA_TIMER_MAX]; -static double timer_end[RMAPS_LAMA_TIMER_MAX]; -static double timer_accum[RMAPS_LAMA_TIMER_MAX]; - -#define RMAPS_LAMA_CLEAR_TIMERS() \ - { \ - if( rmaps_lama_timing_enabled ) { \ - rmaps_lama_clear_timers(); \ - } \ - } -#define 
RMAPS_LAMA_START_TIMER(idx) \ - { \ - if( rmaps_lama_timing_enabled ) { \ - rmaps_lama_set_time(idx, true); \ - } \ - } -#define RMAPS_LAMA_END_TIMER(idx) \ - { \ - if( rmaps_lama_timing_enabled ) { \ - rmaps_lama_set_time(idx, false); \ - } \ - } -#define RMAPS_LAMA_DISPLAY_TIMERS() \ - { \ - if( rmaps_lama_timing_enabled ) { \ - rmaps_lama_display_all_timers(); \ - } \ - } - - -/********************************* - * Structures & Defines - *********************************/ -static void rmaps_lama_hwloc_user_construct(rmaps_lama_hwloc_user_t *item); -static void rmaps_lama_hwloc_user_destruct(rmaps_lama_hwloc_user_t *item); - -OBJ_CLASS_INSTANCE(rmaps_lama_hwloc_user_t, - opal_object_t, - rmaps_lama_hwloc_user_construct, - rmaps_lama_hwloc_user_destruct); - - -/********************************* - * Globals - *********************************/ -/* - * Mapping - */ -rmaps_lama_level_type_t *lama_mapping_layout = NULL; -static rmaps_lama_level_type_t *lama_mapping_layout_sort = NULL; -int lama_mapping_num_layouts = 0; - -/* - * Binding - */ -rmaps_lama_level_type_t lama_binding_level = LAMA_LEVEL_UNKNOWN; -static int lama_binding_num_levels = 0; - -/* - * MPPR - */ -rmaps_lama_level_info_t *lama_mppr_levels = NULL; -int lama_mppr_num_levels = 0; - -/* - * Ordering - */ -static rmaps_lama_order_type_t lama_ordering = LAMA_ORDER_NATURAL; - -/* - * Homogeneous system optimization - */ -bool lama_mppr_max_tree_homogeneous_system = false; - - -/********************************* - * Support Macros - *********************************/ - - -/********************************* - * Support functions - *********************************/ -/* - * Preprocess the command line arguments - */ -static int orte_rmaps_lama_process_params(orte_job_t *jdata); - -/* - * Mapping Support: - * Core mapping function - */ -static int orte_rmaps_lama_map_core(orte_job_t *jdata); - -/* - * Mapping Support: - * Recursive function for mapping process - */ -static int 
rmaps_lama_map_core_iter_level(orte_job_t *jdata, - orte_app_context_t *cur_app_context, - opal_list_t *node_list, - orte_node_t **cur_mach_ptr, - opal_tree_t *max_tree, - int cur_level, - int mach_level, - int **pu_idx_ref, - int **last_pu_idx_ref, - int *num_mapped, - int max_procs, - int *iter_passes); - -/* - * Mapping Support: - * Access the next machine in the node list - */ -static orte_node_t* get_next_machine(orte_job_t *jdata, opal_list_t *node_list, - opal_list_item_t *cur_mach); - -/* - * Mapping Support: - * Check the availability of the requested slot on the specified node - */ -static int check_node_availability(orte_node_t *cur_node, - opal_tree_t *max_tree, - int *pu_idx_ref, - char **slot_list); - -/* - * Mapping Support: - * Debugging PU display - */ -static void display_pu_ref(int *ref, int size, int rank, orte_proc_t *proc); -static char * pu_ref_to_str(int *ref, int size); - -/* - * Mapping Support: - * Convert the process layout 'layer' to the sorted position for the PU - */ -static int convert_layer_to_sort_idx(rmaps_lama_level_type_t layer); - -/* - * MPPR Support: - * Check to make sure a process can be placed on this resource given the - * MPPR restrictions. - */ -static int rmaps_lama_check_mppr(orte_node_t *node, - hwloc_obj_t *child_obj); -static int rmaps_lama_iter_mppr_parents(orte_node_t *node, - hwloc_obj_t *child_obj, - bool check_only); -static int rmaps_lama_iter_mppr_children(orte_node_t *node, - hwloc_obj_t *child_obj, - bool check_only); - -/* - * MPPR Support: - * Increment parents of this child to account for a process being placed - * on this resource. 
- */ -static int rmaps_lama_inc_mppr(orte_node_t *node, - hwloc_obj_t *child_obj); - -/* - * Mapping Support: - * Return the native representation of the slot list - */ -static char * get_native_slot_list(orte_node_t *cur_node, - hwloc_obj_t *pu_obj, - int *put_idx_ref); - -/* - * Ordering Support: - * Reorder sequentially - */ -static int rmaps_lama_ordering_sequential(orte_job_t *jdata); - -/* - * Map a single process to a specific node - */ -static int orte_rmaps_lama_map_process(orte_job_t *jdata, - orte_node_t *node, - int app_idx, - orte_proc_t **proc); - -/********************************* - * Main Module function to map a job - *********************************/ -static int orte_rmaps_lama_map(orte_job_t *jdata) -{ - int ret, exit_status = ORTE_SUCCESS; - mca_base_component_t *loc_comp = &mca_rmaps_lama_component.base_version; - - RMAPS_LAMA_CLEAR_TIMERS(); - RMAPS_LAMA_START_TIMER(RMAPS_LAMA_TIMER_TOTAL); - - /* - * Sanity Check: - * If we are not the 'chosen' mapper, then exit here - */ - if (NULL != jdata->map->req_mapper && - 0 != strcasecmp(jdata->map->req_mapper, loc_comp->mca_component_name)) { - /* a mapper has been specified, and it isn't me */ - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: job %s not using lama mapper (using %s)", - ORTE_JOBID_PRINT(jdata->jobid), - jdata->map->req_mapper); - return ORTE_ERR_TAKE_NEXT_OPTION; - } - - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Mapping job %s", - ORTE_JOBID_PRINT(jdata->jobid)); - - /* - * Identify this as the mapper responsible for this job - */ - if (NULL != jdata->map->last_mapper) { - free(jdata->map->last_mapper); - } - jdata->map->last_mapper = strdup(loc_comp->mca_component_name); - - /* - * Start at the beginning... 
- */ - jdata->num_procs = 0; - - /* - * Process the command line arguments - */ - RMAPS_LAMA_START_TIMER(RMAPS_LAMA_TIMER_PARSE_PARAMS); - if( ORTE_SUCCESS != (ret = orte_rmaps_lama_process_params(jdata)) ) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - RMAPS_LAMA_END_TIMER(RMAPS_LAMA_TIMER_PARSE_PARAMS); - - /* - * Actually map the job - */ - if( ORTE_SUCCESS != (ret = orte_rmaps_lama_map_core(jdata)) ) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - - /* - * All Done - */ - - RMAPS_LAMA_END_TIMER(RMAPS_LAMA_TIMER_TOTAL); - RMAPS_LAMA_DISPLAY_TIMERS(); - - - cleanup: - if( NULL != lama_mapping_layout ) { - free(lama_mapping_layout); - lama_mapping_layout = NULL; - } - - if( NULL != lama_mapping_layout_sort ) { - free(lama_mapping_layout_sort); - lama_mapping_layout_sort = NULL; - } - - if( NULL != lama_mppr_levels ) { - free(lama_mppr_levels); - lama_mppr_levels = NULL; - } - - return exit_status; -} - - -/********************************* - * User defined lookup structure for hwloc topology - *********************************/ -static void rmaps_lama_hwloc_user_construct(rmaps_lama_hwloc_user_t *item) -{ - item->node_mppr = OBJ_NEW(opal_pointer_array_t); - opal_pointer_array_init(item->node_mppr, - ORTE_GLOBAL_ARRAY_BLOCK_SIZE, - ORTE_GLOBAL_ARRAY_MAX_SIZE, - ORTE_GLOBAL_ARRAY_BLOCK_SIZE); -} - -static void rmaps_lama_hwloc_user_destruct(rmaps_lama_hwloc_user_t *item) -{ - orte_std_cntr_t i; - - if( NULL != item->node_mppr ) { - for(i = 0; i < item->node_mppr->size; ++i) { - if( NULL != item->node_mppr->addr[i] ) { - OBJ_RELEASE(item->node_mppr->addr[i]); - item->node_mppr->addr[i] = NULL; - } - } - OBJ_RELEASE(item->node_mppr); - item->node_mppr = NULL; - } -} - - -/********************************* - * Command line parameter parsing functions - *********************************/ -static int orte_rmaps_lama_process_params(orte_job_t *jdata) -{ - int ret, i; - char *type_str = NULL; - - /* - * Process map/bind/order/mppr 
aliases. It will print its own - * error message if something went wrong. - */ - if( ORTE_SUCCESS != (ret = rmaps_lama_process_alias_params(jdata) ) ) { - ORTE_ERROR_LOG(ret); - return ret; - } - - /* - * Parse: Binding. It will print its own error message if - * something goes wrong. - */ - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ---------------------------------"); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ----- Binding : [%s]", - rmaps_lama_cmd_bind); - if( ORTE_SUCCESS != (ret = rmaps_lama_parse_binding(rmaps_lama_cmd_bind, - &lama_binding_level, - &lama_binding_num_levels)) ) { - ORTE_ERROR_LOG(ret); - return ret; - } - - if( 10 <= opal_output_get_verbosity(orte_rmaps_base_framework.framework_output) ) { - type_str = lama_type_enum_to_str(lama_binding_level); - opal_output_verbose(10, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ----- Binding : %*d x %10s", - MAX_BIND_DIGIT_LEN, lama_binding_num_levels, type_str); - free(type_str); - type_str = NULL; - } - /* Reset the binding option since we are going to do it ourselves */ - OPAL_SET_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_NONE); - - /* - * Parse: Mapping from Process Layout string. It will print its - * own error message if something goes wrong. 
- */ - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ---------------------------------"); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ----- Mapping : [%s]", - rmaps_lama_cmd_map); - if( ORTE_SUCCESS != (ret = rmaps_lama_parse_mapping(rmaps_lama_cmd_map, - &lama_mapping_layout, - &lama_mapping_layout_sort, - &lama_mapping_num_layouts)) ) { - ORTE_ERROR_LOG(ret); - return ret; - } - - if( 10 <= opal_output_get_verbosity(orte_rmaps_base_framework.framework_output) ) { - for( i = 0; i < lama_mapping_num_layouts; ++i ) { - type_str = lama_type_enum_to_str(lama_mapping_layout[i]); - opal_output_verbose(10, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ----- Mapping : (%d) %10s (%d vs %d)", - i, type_str, - lama_mapping_layout[i], lama_mapping_layout_sort[i]); - free(type_str); - type_str = NULL; - } - } - - /* - * Parse: MPPR. It will print its own error message if something - * goes wrong. - */ - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ---------------------------------"); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ----- MPPR : [%s]", - rmaps_lama_cmd_mppr); - if( ORTE_SUCCESS != (ret = rmaps_lama_parse_mppr(rmaps_lama_cmd_mppr, - &lama_mppr_levels, - &lama_mppr_num_levels)) ) { - ORTE_ERROR_LOG(ret); - return ret; - } - - if( 10 <= opal_output_get_verbosity(orte_rmaps_base_framework.framework_output) ) { - for( i = 0; i < lama_mppr_num_levels; ++i ) { - type_str = lama_type_enum_to_str(lama_mppr_levels[i].type); - opal_output_verbose(10, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ----- MPPR : %*d at %10s", - MAX_BIND_DIGIT_LEN, lama_mppr_levels[i].max_resources, type_str); - free(type_str); - type_str = NULL; - } - } - - /* - * Parse: Ordering - */ - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: 
---------------------------------"); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ----- Ordering : [%s]", - rmaps_lama_cmd_ordering); - if( ORTE_SUCCESS != (ret = rmaps_lama_parse_ordering(rmaps_lama_cmd_ordering, - &lama_ordering)) ) { - ORTE_ERROR_LOG(ret); - return ret; - } - - if( 10 <= opal_output_get_verbosity(orte_rmaps_base_framework.framework_output) ) { - if( LAMA_ORDER_NATURAL == lama_ordering ) { - type_str = strdup("Natural"); - } - else if( LAMA_ORDER_SEQ == lama_ordering ) { - type_str = strdup("Sequential"); - } - else { - type_str = strdup("Unknown"); - } - opal_output_verbose(10, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ----- Ordering : %10s", - type_str); - free(type_str); - type_str = NULL; - } - - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ---------------------------------"); - - return ORTE_SUCCESS; -} - - -/********************************* - * Support functions - *********************************/ -rmaps_lama_level_type_t lama_type_str_to_enum(char *param) -{ - if( 0 == strncmp(param, "n", strlen("n")) ) { - return LAMA_LEVEL_MACHINE; - } - else if( 0 == strncmp(param, "b", strlen("b")) ) { - return LAMA_LEVEL_BOARD; - } - else if( 0 == strncmp(param, "s", strlen("s")) ) { - return LAMA_LEVEL_SOCKET; - } - else if( 0 == strncmp(param, "c", strlen("c")) ) { - return LAMA_LEVEL_CORE; - } - else if( 0 == strncmp(param, "h", strlen("h")) ) { - return LAMA_LEVEL_PU; - } - else if( 0 == strncmp(param, "L1", strlen("L1")) ) { - return LAMA_LEVEL_CACHE_L1; - } - else if( 0 == strncmp(param, "L2", strlen("L2")) ) { - return LAMA_LEVEL_CACHE_L2; - } - else if( 0 == strncmp(param, "L3", strlen("L3")) ) { - return LAMA_LEVEL_CACHE_L3; - } - else if( 0 == strncmp(param, "N", strlen("N")) ) { - return LAMA_LEVEL_NUMA; - } - - return LAMA_LEVEL_UNKNOWN; -} - -char * lama_type_enum_to_str(rmaps_lama_level_type_t param) -{ - if( LAMA_LEVEL_MACHINE == 
param ) { - return strdup("Machine"); - } - else if( LAMA_LEVEL_BOARD == param ) { - return strdup("Board"); - } - else if( LAMA_LEVEL_SOCKET == param ) { - return strdup("Socket"); - } - else if( LAMA_LEVEL_CORE == param ) { - return strdup("Core"); - } - else if( LAMA_LEVEL_PU == param ) { - return strdup("Hw. Thread"); - } - else if( LAMA_LEVEL_CACHE_L1 == param ) { - return strdup("L1 Cache"); - } - else if( LAMA_LEVEL_CACHE_L2 == param ) { - return strdup("L2 Cache"); - } - else if( LAMA_LEVEL_CACHE_L3 == param ) { - return strdup("L3 Cache"); - } - else if( LAMA_LEVEL_NUMA == param ) { - return strdup("NUMA"); - } - - return strdup("Unknown"); -} - -/********************************* - * Core Mapper function - *********************************/ -static int orte_rmaps_lama_map_core(orte_job_t *jdata) -{ - int ret, exit_status = ORTE_SUCCESS; - int cur_app_idx = 0; - int num_slots; - orte_app_context_t *cur_app_context = NULL; - orte_node_t *cur_mach = NULL; - orte_node_t **cur_mach_ptr = NULL; - orte_proc_t *proc = NULL; - opal_list_t *node_list = NULL; - opal_list_item_t *item = NULL; - opal_tree_t *max_tree = NULL; - int *pu_idx_ref = NULL; - int *last_pu_idx_ref = NULL; - int i, num_mapped, last_num_mapped, mach_level = -1; - orte_std_cntr_t j; - int max_procs_to_map; - int iter_passes; - char * last_level_str = NULL; - bool initial_map = true; - - /* - * Setup PU reference - * Find the position of the 'machine' - */ - pu_idx_ref = (int*)malloc(sizeof(int) * lama_mapping_num_layouts); - if (NULL == pu_idx_ref) { - return ORTE_ERROR; - } - last_pu_idx_ref = (int*)malloc(sizeof(int) * lama_mapping_num_layouts); - if (NULL == last_pu_idx_ref) { - free(pu_idx_ref); - return ORTE_ERROR; - } - - for( i = 0; i < lama_mapping_num_layouts; ++i ) { - pu_idx_ref[i] = 0; - last_pu_idx_ref[i] = -1; - if( LAMA_LEVEL_MACHINE == lama_mapping_layout[i] ) { - mach_level = i; - } - } - - /* - * Foreach app context - */ - for(cur_app_idx = 0; cur_app_idx < jdata->apps->size; 
++cur_app_idx ) { - if( NULL == (cur_app_context = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, cur_app_idx))) { - continue; - } - - /* - * Get the list of nodes for this app_context. - */ - node_list = OBJ_NEW(opal_list_t); - ret = orte_rmaps_base_get_target_nodes(node_list, - &num_slots, - cur_app_context, - jdata->map->mapping, - initial_map, false); - if(ORTE_SUCCESS != ret ) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - /* Flag that all subsequent requests should not reset the node->mapped flag */ - initial_map = false; - - /* - * If a bookmark exists from some prior mapping, then start from there - */ - cur_mach = (orte_node_t*)orte_rmaps_base_get_starting_point(node_list, jdata); - - /* - * If the application did not specify the number of procs - * then set it to the number of 'slots' - * JJH: TODO: Revisit 'max_procs' calculation - */ - if (0 == cur_app_context->num_procs) { - cur_app_context->num_procs = num_slots; - } - max_procs_to_map = cur_app_context->num_procs; - - /* - * Build the Max Tree - */ - RMAPS_LAMA_START_TIMER(RMAPS_LAMA_TIMER_BUILD_MAX_TREE); - max_tree = rmaps_lama_create_empty_max_tree(); - if( ORTE_SUCCESS != (ret = rmaps_lama_build_max_tree(jdata, node_list, - max_tree, - &lama_mppr_max_tree_homogeneous_system)) ) { - exit_status = ret; - goto cleanup; - } - RMAPS_LAMA_END_TIMER(RMAPS_LAMA_TIMER_BUILD_MAX_TREE); - - - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Mapping: -----------------------"); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ---------------------------------"); - RMAPS_LAMA_START_TIMER(RMAPS_LAMA_TIMER_MAPPING); - - /* - * Clear PU reference - */ - for( i = 0; i < lama_mapping_num_layouts; ++i ) { - pu_idx_ref[i] = 0; - } - - /* - * Mapping: Recursively loop over all levels - */ - num_mapped = 0; - last_num_mapped = 0; - iter_passes = 0; - cur_mach_ptr = (orte_node_t**)malloc(sizeof(orte_node_t*)); 
- *cur_mach_ptr = cur_mach; - while( max_procs_to_map > num_mapped ) { - ret = rmaps_lama_map_core_iter_level(jdata, - cur_app_context, - node_list, - cur_mach_ptr, - max_tree, - lama_mapping_num_layouts-1, - mach_level, - &pu_idx_ref, - &last_pu_idx_ref, - &num_mapped, - max_procs_to_map, - &iter_passes); - if( ORTE_SUCCESS != ret ) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - - /* - * We only get here (without finishing the mapping) if we are going to - * start oversubscribing resources. - */ - if( max_procs_to_map > num_mapped ) { - if( !rmaps_lama_can_oversubscribe ) { - orte_show_help("help-orte-rmaps-lama.txt", - "orte-rmaps-lama:oversubscribe", - true, - num_mapped, max_procs_to_map); - exit_status = ORTE_ERROR; - goto cleanup; - } else { - rmaps_lama_am_oversubscribing = true; - } - } - - /* - * Check to see if we have made any progress in the mapping loop - */ - if( 0 < cur_app_idx && 2 == iter_passes ) { - /* - * Give it another pass: - * This is an edge case when we are trying to restart from a - * bookmark left by a previous app context. If this app context - * is starting from exactly the beginning of the allocation - * then the recursive loop could return out here after the - * increment pass. This is indicated by (iter_passes = 2). - * Since no processes were mapped, we just try again. - */ - } - else if( last_num_mapped == num_mapped ) { - orte_show_help("help-orte-rmaps-lama.txt", - "orte-rmaps-lama:no-resources-available", - true, - cur_app_idx, - num_mapped, max_procs_to_map, - (NULL == rmaps_lama_cmd_map ? "[Not Provided]" : rmaps_lama_cmd_map), - (NULL == rmaps_lama_cmd_bind ? "[Not Provided]" : rmaps_lama_cmd_bind), - (NULL == rmaps_lama_cmd_mppr ? "[Not Provided]" : rmaps_lama_cmd_mppr), - (NULL == rmaps_lama_cmd_ordering ? 
"[Not Provided]" : rmaps_lama_cmd_ordering)); - exit_status = ORTE_ERROR; - goto cleanup; - } else { - last_num_mapped = num_mapped; - } - } - - /* - * Display Bookmark for debugging - */ - last_level_str = pu_ref_to_str(last_pu_idx_ref, lama_mapping_num_layouts); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Bookmark: --> Node %10s PU %10s", - jdata->bookmark->name, last_level_str); - free(last_level_str); - last_level_str = NULL; - - /* - * Clenup for next iteration - */ - if( NULL != node_list ) { - while(NULL != (item = opal_list_remove_first(node_list))) { - OBJ_RELEASE(item); - } - OBJ_RELEASE(node_list); - node_list = NULL; - } - - OBJ_RELEASE(max_tree); - max_tree = NULL; - } - - RMAPS_LAMA_END_TIMER(RMAPS_LAMA_TIMER_MAPPING); - - - /* - * Ordering - */ - RMAPS_LAMA_START_TIMER(RMAPS_LAMA_TIMER_ORDERING); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ---------------------------------"); - if( LAMA_ORDER_SEQ == lama_ordering ) { - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Ordering: Sequential ------------"); - - if( ORTE_SUCCESS != (ret = rmaps_lama_ordering_sequential(jdata)) ) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - } - else { - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Ordering: Natural ---------------"); -#if 0 - /* - * We compute our own vpids inline with the algorithm. So no need to use the - * orte_rmaps_base_compute_vpids() function. 
- */ -#endif - } - RMAPS_LAMA_END_TIMER(RMAPS_LAMA_TIMER_ORDERING); - - - /* - * Display Mapping - */ - if( 10 <= opal_output_get_verbosity(orte_rmaps_base_framework.framework_output) ) { - char *cpu_bitmap; - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ---------------------------------"); - for( j = 0; j < jdata->procs->size; ++j) { - if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) { - continue; - } - cpu_bitmap = NULL; - orte_get_attribute(&proc->attributes, ORTE_PROC_CPU_BITMAP, (void**)&cpu_bitmap, OPAL_STRING); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Ordering: Proc. %2d on Node %10s - Slot %s", - proc->name.vpid, proc->node->name, cpu_bitmap); - if (NULL != cpu_bitmap) { - free(cpu_bitmap); - } - } - } - - - /* - * All done - */ - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Finished ------------------------"); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ---------------------------------"); - - - cleanup: - if( NULL != node_list ) { - while(NULL != (item = opal_list_remove_first(node_list))) { - OBJ_RELEASE(item); - } - OBJ_RELEASE(node_list); - } - - if( NULL != max_tree ) { - OBJ_RELEASE(max_tree); - } - - free(pu_idx_ref); - free(last_pu_idx_ref); - - if( NULL != last_level_str ) { - free(last_level_str); - } - - return exit_status; -} - -static int rmaps_lama_map_core_iter_level(orte_job_t *jdata, - orte_app_context_t *cur_app_context, - opal_list_t *node_list, - orte_node_t **cur_mach_ptr, - opal_tree_t *max_tree, - int cur_level, - int mach_level, - int **pu_idx_ref, - int **last_pu_idx_ref, - int *num_mapped, - int max_procs, - int *iter_passes) -{ - int ret, exit_status = ORTE_SUCCESS; - int i, j; - opal_tree_item_t *tree_for_level = NULL; - int max_subtree_arity = 0; - char * level_str = NULL; - char * last_level_str = NULL; - char * slot_list = 
NULL; - orte_proc_t *proc = NULL; - int pu_idx = 0; - - /* - * Find the current tree for this level - * If it is the machine level, then we need to access the information from - * the node list, not the max_tree. - */ - if( cur_level != mach_level ) { - tree_for_level = opal_tree_find_with(opal_tree_get_root(max_tree), - &lama_mapping_layout[cur_level]); - /* - * We do not need subtree, but the arity of the subtree - * JJH TODO: This should be an opal_tree function. - */ - max_subtree_arity = 1; /* include self */ - while( NULL != (tree_for_level = opal_tree_get_next_sibling(tree_for_level)) ) { - ++max_subtree_arity; - } - } - else if( NULL == *cur_mach_ptr ) { - *cur_mach_ptr = get_next_machine(jdata, node_list, (opal_list_item_t*)(*cur_mach_ptr)); - } - - pu_idx = convert_layer_to_sort_idx(lama_mapping_layout[cur_level]); - level_str = lama_type_enum_to_str(lama_mapping_layout[cur_level]); - - /* - * Do we need to advance to a bookmark - */ - if( (*last_pu_idx_ref)[0] >= 0 && 0 == *iter_passes ) { - /* - * Display last mapped - */ - last_level_str = pu_ref_to_str(*last_pu_idx_ref, lama_mapping_num_layouts); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Bookmark: --> Last Mapped: Node %10s (bkmrk %10s) PU %10s - Level %2d", - (NULL == *cur_mach_ptr ? 
"(NULL)" : (*cur_mach_ptr)->name), - jdata->bookmark->name, last_level_str, (*last_pu_idx_ref)[pu_idx]); - free(last_level_str); - last_level_str = NULL; - - /* - * Set the level starting point to the last known index - */ - i = (*last_pu_idx_ref)[pu_idx]; - } else { - i = 0; - } - - - /* - * Loop over all siblings at this level - * Initial condition above, Increment at bottom, Break check at bottom - */ - while( 1 ) { - /* - * Define the PU index - */ - (*pu_idx_ref)[pu_idx] = i; - - if( (*last_pu_idx_ref)[0] >= 0 && 0 == *iter_passes ) { - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Mapping: --> Level %2d: %10s (%2d) - I %2d - Arity %2d - %10s - Increment only", - cur_level+1, - level_str, pu_idx, i, max_subtree_arity, - (NULL == *cur_mach_ptr ? "" : (*cur_mach_ptr)->name)); - } else { - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Mapping: --> Level %2d: %10s (%2d) - I %2d - Arity %2d - %10s", - cur_level+1, - level_str, pu_idx, i, max_subtree_arity, - (NULL == *cur_mach_ptr ? "" : (*cur_mach_ptr)->name)); - } - - - /* - * If not the inner most loop, iterate to the next level down - */ - if( cur_level > 0 ) { - ret = rmaps_lama_map_core_iter_level(jdata, - cur_app_context, - node_list, - cur_mach_ptr, - max_tree, - cur_level - 1, - mach_level, - pu_idx_ref, - last_pu_idx_ref, - num_mapped, - max_procs, - iter_passes); - if( ORTE_SUCCESS != ret ) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - } - /* - * If we are restarting the iteration from a previous bookmark then - * the first pass through is a no-op mapping pass that just increments - * the PU reference. - * Called by innermost loop - */ - else if( (*last_pu_idx_ref)[0] >= 0 && 0 == *iter_passes ) { - *iter_passes += 1; - } - /* - * Try to map at this location - */ - else { - /* - * On first pass, make sure we increment this, just so we do not - * accidentally think this is an increment pass. 
- */ - if( 0 == *iter_passes ) { - *iter_passes += 1; - } - - /* - * Display the PU ref for debugging - */ - display_pu_ref(*pu_idx_ref, lama_mapping_num_layouts, *num_mapped, proc); - - - /* - * Check to see if this resource is available on this node. - * - * In a heterogeneous or otherwise non-uniformly restricted - * environment we may iterate to a resource that is not - * available either because it does not exist, or is not - * available for allocation (off-lined, sub-node allocation). - * Additionally, we need to check resource constrains expressed - * in the MPPR and binding. - */ - ret = check_node_availability((*cur_mach_ptr), - max_tree, - *pu_idx_ref, - &slot_list); - if( ORTE_SUCCESS != ret || NULL == slot_list ) { - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:hwtopo: Mapping: --> Level %2d: %s - INVALID/SKIP", - cur_level+1, - level_str); - /* - * By not mapping here we just let the iterations continue - * until a suitable match is found or we have exhausted all - * possible locations to match and thus cannot map any more. - */ - } - else { - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Mapping: --> Level %2d: %s - Slot List (%s)", - cur_level+1, - level_str, slot_list); - - /* - * Map this process onto the resource specified - * level_tree_objs[*] and cur_mach point to the specific resource - */ - proc = NULL; - ret = orte_rmaps_lama_map_process(jdata, - (*cur_mach_ptr), - cur_app_context->idx, - &proc); - if( ORTE_SUCCESS != ret ) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto bailout; - } - - /* - * Set the binding for this process - */ - orte_set_attribute(&proc->attributes, ORTE_PROC_CPU_BITMAP, ORTE_ATTR_GLOBAL, slot_list, OPAL_STRING); - /* - * Insert the proc into the 'native' ordering location. 
- */ - proc->name.vpid = jdata->num_procs; - if (ORTE_SUCCESS != (ret = opal_pointer_array_set_item(jdata->procs, - proc->name.vpid, proc))) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - jdata->num_procs += 1; - - /* - * Save a bookmark so we can return here later if necessary - */ - for( j = 0; j < lama_mapping_num_layouts; ++j ) { - (*last_pu_idx_ref)[j] = (*pu_idx_ref)[j]; - } - jdata->bookmark = (orte_node_t*)(*cur_mach_ptr); - - (*num_mapped)++; - } - } - - /* - * Increment loop - * - * If we are binding, then we may need to advance the binding layer - * by more than one. - */ - if( cur_level != mach_level ) { - if( lama_binding_level == lama_mapping_layout[cur_level] ) { - i += lama_binding_num_levels; - } else { - ++i; - } - } else { - /* - * Note: Currently we do not allow for 'binding' to multiple machines - * But keep the code just in case we want to play with 'stride' later - */ - if( lama_binding_level == lama_mapping_layout[cur_level] && lama_binding_num_levels > 1) { - opal_output(0, "mca:rmaps:lama: ERROR: Cannot bind to multiple machines - SHOULD NEVER HAPPEN: %s", - rmaps_lama_cmd_bind); - exit_status = ORTE_ERROR; - goto bailout; -#if 0 - for( j = 0; j < lama_binding_num_levels; ++j ) { - cur_mach = get_next_machine(jdata, node_list, (opal_list_item_t*)cur_mach); - if( NULL == cur_mach ) { - break; - } - ++i; - } -#endif - } else { - *cur_mach_ptr = get_next_machine(jdata, node_list, (opal_list_item_t*)(*cur_mach_ptr)); - ++i; - } - } - - /* - * Check if we are done mapping before iterating again - */ - if( max_procs <= *num_mapped ) { - exit_status = ORTE_SUCCESS; - goto cleanup; - } - - /* - * Check if we are done looping - */ - if( cur_level != mach_level ) { - if( i >= max_subtree_arity ) { - break; - } - } else { - if( NULL == *cur_mach_ptr ) { - break; - } - } - } - - - /* - * Sanity Check: Check if we are done mapping - */ - if( max_procs <= *num_mapped ) { - exit_status = ORTE_SUCCESS; - goto cleanup; - } - - cleanup: 
- /* - * If the outermost layer, the increment the number of iteration passes. - */ - if( cur_level == lama_mapping_num_layouts-1 ) { - *iter_passes += 1; - } - - bailout: - if( NULL != level_str ) { - free(level_str); - level_str = NULL; - } - - if( NULL != slot_list ) { - free(slot_list); - slot_list = NULL; - } - - return exit_status; -} - -static orte_node_t* get_next_machine(orte_job_t *jdata, opal_list_t *node_list, - opal_list_item_t *cur_mach) -{ - orte_node_t *next_mach = NULL; - - if( NULL == cur_mach ) { - next_mach = (orte_node_t*)opal_list_get_first(node_list); - } - else if( opal_list_get_last(node_list) == cur_mach ) { - next_mach = NULL; - } - else { - next_mach = (orte_node_t*)opal_list_get_next(cur_mach); - } - - return next_mach; -} - -static int orte_rmaps_lama_map_process(orte_job_t *jdata, - orte_node_t *node, - int app_idx, - orte_proc_t **proc) -{ - int ret; - - /* - * Add this node to the map, but only once - */ - if( !ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED) ) { - if (ORTE_SUCCESS > (ret = opal_pointer_array_add(jdata->map->nodes, (void*)node))) { - ORTE_ERROR_LOG(ret); - return ret; - } - ORTE_FLAG_SET(node, ORTE_NODE_FLAG_MAPPED); - OBJ_RETAIN(node); /* maintain accounting on object */ - ++(jdata->map->num_nodes); - } - - /* - * Setup the process object - */ - if (NULL == (*proc = orte_rmaps_base_setup_proc(jdata, node, app_idx))) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - return ORTE_ERR_OUT_OF_RESOURCE; - } - - return ORTE_SUCCESS; -} - -static int rmaps_lama_ordering_sequential(orte_job_t *jdata) -{ - orte_job_map_t *map; - orte_proc_t *proc = NULL, *swap = NULL; - orte_std_cntr_t i, j; - int cur_rank = 0; - orte_node_t *cur_node = NULL; - - map = jdata->map; - - opal_output_verbose(15, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ---------------------------------"); - - /* - * Assign the ranks sequentially - */ - for( i = 0; i < map->nodes->size; ++i) { - if (NULL == (cur_node = 
(orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) { - continue; - } - for( j = 0; j < cur_node->procs->size; ++j) { - if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(cur_node->procs, j))) { - continue; - } - /* ignore procs from other jobs */ - if (proc->name.jobid != jdata->jobid) { - continue; - } - - opal_output_verbose(15, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Ordering: Rename Proc. %2d to %2d (Rev. %s)", - proc->name.vpid, cur_rank, proc->node->name); - proc->name.vpid = cur_rank; - ++cur_rank; - } - } - - /* - * Fix the job structure ordering - Sort by new vpid - * - * If we do not do this then the remote daemons assign the incorrect - * ranks to the processes since they use the relative ordering in the - * jdata->procs structure to determine vpids locally. - * - * JJH: Look at combining these loops with the loop in the core so we - * JJH: do not have to iterate over the list two times - */ - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ---------------------------------"); - cur_rank = 0; - for( j = 0; j < jdata->procs->size; ++j) { - if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) { - continue; - } - - opal_output_verbose(15, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Ordering: Proc. %2d on Node %s", - proc->name.vpid, proc->node->name); - - while((int)proc->name.vpid != cur_rank ) { - swap = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->name.vpid); - - opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc); - opal_pointer_array_set_item(jdata->procs, cur_rank, swap); - - opal_output_verbose(15, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Ordering: \t SWAP Proc. %2d (%d) and Proc. 
%2d (%d)", - proc->name.vpid, cur_rank, swap->name.vpid, proc->name.vpid); - proc = swap; - } - ++cur_rank; - } - - return ORTE_SUCCESS; -} - -static int convert_layer_to_sort_idx(rmaps_lama_level_type_t layer) -{ - int i; - - for(i = 0; i < lama_mapping_num_layouts; ++i ) { - if( lama_mapping_layout_sort[i] == layer ) { - return i; - } - } - - return 0; -} - -static void display_pu_ref(int *ref, int size, int rank, orte_proc_t *proc) -{ - char *str = NULL; - - str = pu_ref_to_str(ref, size); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Mapping: PU Ref: %s [Rank %2d] Name: %s", - str, rank, - (NULL == proc ? "(null)" : ORTE_NAME_PRINT(&proc->name))); - - free(str); - - return; -} - -static char * pu_ref_to_str(int *ref, int size) -{ - int i, idx; - char *str = NULL; - - str = (char *)malloc(sizeof(char) * (2 * size)); - for(i = 0, idx = 0; i < size; ++i, idx += 2) { - sprintf(&(str[idx]), "%2d", ref[i]); - } - - return str; -} - -static int check_node_availability(orte_node_t *cur_node, - opal_tree_t *max_tree, - int *pu_idx_ref, - char **slot_list) -{ - int exit_status = ORTE_SUCCESS; - int i; - char * level_str = NULL; - hwloc_obj_t *topo_child = NULL, *topo_parent, *topo_allocated; - - - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Checking: Node (%s) -------------", - cur_node->name); - opal_output_verbose(11, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ---------------------------------"); - - - /* - * Determine if the current node has the necessary hardware - * as described by the PU index. - * Find the hwloc object reference for the resource pointed to - * by the PU index. - * JJH TODO: If homogeneous system then this could be simplified. 
- */ - topo_allocated = topo_parent = (hwloc_obj_t*)malloc(sizeof(hwloc_obj_t) * 1); - if (NULL == topo_parent) { - return ORTE_ERROR; - } - *topo_parent = hwloc_get_obj_by_depth(cur_node->topology, 0, 0); - for( i = 0; i < lama_mapping_num_layouts; ++i ) { - /* - * Skip 'machine' level - */ - if( LAMA_LEVEL_MACHINE == lama_mapping_layout_sort[i] ) { - continue; - } - /* - * Skip 'board' level - * JJH: HWLOC does not support BOARD at the moment - */ - if( LAMA_LEVEL_BOARD == lama_mapping_layout_sort[i] ) { - continue; - } - - level_str = lama_type_enum_to_str(lama_mapping_layout_sort[i]); - opal_output_verbose(11, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Checking: %2d of %s", - pu_idx_ref[i], level_str); - - /* - * Find the nth subtree matching the current key - */ - topo_child = rmaps_lama_find_nth_subtree_match(cur_node->topology, - *topo_parent, - pu_idx_ref[i], - lama_mapping_layout_sort[i]); - - /* - * If it does not exist, then this node is not capable of matching - * so it is unavailable. 
- */ - if( NULL == topo_child ) { - opal_output_verbose(11, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Check failed: Node %s does not have a %10s %2d", - cur_node->name, level_str, pu_idx_ref[i]); - exit_status = ORTE_ERROR; - goto cleanup; - } - - /* - * Keep decending the tree - */ - topo_parent = topo_child; - free(level_str); - level_str = NULL; - } - - /* - * We have sufficient hardware :) - */ - - - /* - * Return the native slot list to bind to - * Internally checks the MPPR - */ - *slot_list = get_native_slot_list(cur_node, topo_parent, pu_idx_ref); - if( NULL == *slot_list ) { - goto cleanup; - } - - cleanup: - if( NULL != level_str ) { - free(level_str); - level_str = NULL; - } - - if( ORTE_SUCCESS != exit_status ) { - if( NULL != *slot_list ) { - free(*slot_list); - *slot_list = NULL; - } - } - - free(topo_allocated); - - return exit_status; -} - -static int rmaps_lama_check_mppr(orte_node_t *node, - hwloc_obj_t *child_obj) -{ - int ret; - - /* - * Optimization if no MPPR provided - */ - if( NULL == lama_mppr_levels ) { - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: No MPPR to check - Skip..."); - return ORTE_SUCCESS; - } - - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Check ---------------------------"); - /* - * Check Parents (excluding self) - */ - if( ORTE_SUCCESS != (ret = rmaps_lama_iter_mppr_parents(node, &(*child_obj)->parent, true)) ) { - return ret; - } - - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Check ---------------------------"); - - /* - * Check Children (including self) - */ - if( ORTE_SUCCESS != (ret = rmaps_lama_iter_mppr_children(node, child_obj, true)) ) { - return ret; - } - - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Check ---------------------------"); - - return ORTE_SUCCESS; -} - -static int rmaps_lama_inc_mppr(orte_node_t *node, - 
hwloc_obj_t *child_obj) -{ - int ret; - - /* - * Optimization if no MPPR provided - */ - if( NULL == lama_mppr_levels ) { - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: No MPPR to increment - Skip..."); - return ORTE_SUCCESS; - } - - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Inc ---------------------------"); - /* - * Increment Parents (excluding self) - */ - if( ORTE_SUCCESS != (ret = rmaps_lama_iter_mppr_parents(node, &(*child_obj)->parent, false)) ) { - return ret; - } - - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Inc ---------------------------"); - - /* - * Increment Children (including self) - */ - if( ORTE_SUCCESS != (ret = rmaps_lama_iter_mppr_children(node, child_obj, false)) ) { - return ret; - } - - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Inc ---------------------------"); - - return ORTE_SUCCESS; -} - -static int rmaps_lama_iter_mppr_parents(orte_node_t *node, - hwloc_obj_t *child_obj, - bool check_only) -{ - rmaps_lama_hwloc_user_t *hwloc_userdata = NULL; - rmaps_lama_node_mppr_t *mppr_accounting = NULL; - char str[128]; - - /* - * Basecase - */ - if( NULL == *child_obj ) { - return ORTE_SUCCESS; - } - - /* - * Check self - */ - /* - * Access MPPR info for this object - */ - hwloc_userdata = (rmaps_lama_hwloc_user_t*)((opal_hwloc_topo_data_t*)(*child_obj)->userdata)->userdata; - mppr_accounting = (rmaps_lama_node_mppr_t*)opal_pointer_array_get_item(hwloc_userdata->node_mppr, node->index); - - hwloc_obj_snprintf(str, sizeof(str), node->topology, *child_obj, "#", 0); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: %s: P [%2d] %10s - %20s - Max %3d , Cur %3d (Oversub.: %s / %s)", - (check_only ? "Checking " : "Increment"), - node->index, node->name, str, - mppr_accounting->max, - (check_only ? 
mppr_accounting->cur : mppr_accounting->cur + 1), - (rmaps_lama_am_oversubscribing ? "T" : "F"), - (rmaps_lama_can_oversubscribe ? "T" : "F") ); - - /* - * Check limits - Error on first to exceed - */ - if( check_only ) { - if( mppr_accounting->max >= 0 && !rmaps_lama_am_oversubscribing) { - if( (mppr_accounting->cur)+1 > mppr_accounting->max ) { - return ORTE_ERROR; - } - } - } - /* - * Increment current number allocated below this level - */ - else { - mppr_accounting->cur += 1; - } - - /* - * Go to parent - */ - return rmaps_lama_iter_mppr_parents(node, &((*child_obj)->parent), check_only); -} - -static int rmaps_lama_iter_mppr_children(orte_node_t *node, - hwloc_obj_t *child_obj, - bool check_only) -{ - int ret; - rmaps_lama_hwloc_user_t *hwloc_userdata = NULL; - rmaps_lama_node_mppr_t *mppr_accounting = NULL; - char str[128]; - int i; - - /* - * Check self - */ - /* - * Access MPPR info for this object - */ - hwloc_userdata = (rmaps_lama_hwloc_user_t*)((opal_hwloc_topo_data_t*)(*child_obj)->userdata)->userdata; - mppr_accounting = (rmaps_lama_node_mppr_t*)opal_pointer_array_get_item(hwloc_userdata->node_mppr, node->index); - - hwloc_obj_snprintf(str, sizeof(str), node->topology, *child_obj, "#", 0); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: %s: C [%2d] %10s - %20s - Max %3d , Cur %3d (Oversub.: %s / %s)", - (check_only ? "Checking " : "Increment"), - node->index, node->name, str, - mppr_accounting->max, - (check_only ? mppr_accounting->cur : mppr_accounting->cur + 1), - (rmaps_lama_am_oversubscribing ? "T" : "F"), - (rmaps_lama_can_oversubscribe ? 
"T" : "F") ); - - /* - * Check limits - Error on first to exceed - */ - if( check_only ) { - if( mppr_accounting->max >= 0 && !rmaps_lama_am_oversubscribing) { - if( (mppr_accounting->cur)+1 > mppr_accounting->max ) { - return ORTE_ERROR; - } - } - } - /* - * Increment current number allocated below this level - */ - else { - mppr_accounting->cur += 1; - } - - /* - * Check all children - */ - for(i = 0; i < (int)(*child_obj)->arity; ++i ) { - if( ORTE_SUCCESS != (ret = rmaps_lama_iter_mppr_children(node, &((*child_obj)->children[i]), check_only)) ) { - return ret; - } - } - - return ORTE_SUCCESS; -} - - -static char * get_native_slot_list(orte_node_t *cur_node, hwloc_obj_t *pu_obj, int *put_idx_ref) -{ - int i; - char *slot_list = NULL; - hwloc_obj_t *binding_parent = NULL; - hwloc_obj_t *cur_parent = NULL; - hwloc_cpuset_t binding_cpuset; - hwloc_cpuset_t scratch_cpuset; - char *type_str = NULL; - - /* - * Sanity check - */ - if( NULL == pu_obj ) { - return NULL; - } - - /* - * Determine the cpumask to send to the backend for binding - */ - - /* - * Iterate up the tree until we reach the binding parent - */ - binding_parent = rmaps_lama_find_parent(cur_node->topology, pu_obj, lama_binding_level); - if( NULL == binding_parent ) { - return NULL; - } - - /* - * Iterate across cousins until we find enough resources or hit the node boundary - */ - binding_cpuset = hwloc_bitmap_alloc(); - hwloc_bitmap_zero(binding_cpuset); - - scratch_cpuset = hwloc_bitmap_alloc(); - - cur_parent = binding_parent; - - for(i = 0; i < lama_binding_num_levels; ++i) { - /* - * Check MPPR Availability - */ - if( ORTE_SUCCESS != rmaps_lama_check_mppr(cur_node, cur_parent) ) { - goto cleanup; - } - - /* - * Accumulate the bitmask - * - * JJH: TODO: Add resource offline check (?) - */ - hwloc_bitmap_zero(scratch_cpuset); - /* JJH: Maybe use opal_hwloc_base_get_available_cpus(cur_node->topology, (*cur_parent)) ? - * They do pretty much the same thing, but with more checks... 
- */ - hwloc_bitmap_and(scratch_cpuset, (*cur_parent)->allowed_cpuset, (*cur_parent)->online_cpuset); - hwloc_bitmap_or(binding_cpuset, scratch_cpuset, binding_cpuset); - -#if 0 - { - hwloc_obj_snprintf(str, sizeof(str), cur_node->topology, *cur_parent, "#", 0); - printf("--> BINDING TO -- %-20s \t -- %2d of %2d -- %2d vs %2d\n",str, - i, lama_binding_level, - (*binding_parent)->logical_index, (*cur_parent)->logical_index); - - hwloc_bitmap_snprintf(str, sizeof(str), (*cur_parent)->allowed_cpuset ); - printf("--> CPU A : %-20s\n", str); - hwloc_bitmap_snprintf(str, sizeof(str), (*cur_parent)->online_cpuset ); - printf("--> CPU B : %-20s\n", str); - hwloc_bitmap_snprintf(str, sizeof(str), scratch_cpuset); - printf("--> CPU C : %-20s\n", str); - hwloc_bitmap_snprintf(str, sizeof(str), binding_cpuset); - printf("--> CPU D : %-20s\n", str); - } -#endif - - /* - * Iterate to the next cousin. - * If we exceed the boundary of the node, then send up an error. - */ - if( (i+1) < lama_binding_num_levels && NULL == (*cur_parent)->next_cousin ) { - type_str = lama_type_enum_to_str(lama_binding_level); - opal_output_verbose(10, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Error: Not able to bind to %*d x %10s - Stopped at %*d", - MAX_BIND_DIGIT_LEN, lama_binding_num_levels, - type_str, - MAX_BIND_DIGIT_LEN, i); - free(type_str); - type_str = NULL; - goto cleanup; - } - /* - * Point to the next cousin - */ - if( NULL != (*cur_parent)->next_cousin ) { - cur_parent = &((*cur_parent)->next_cousin); - } - } - - /* - * Account for the process placement in the MPPR - * Assumes a previous check - * We cannot do this in the loop, since if the MPPR check fails we would - * need to roll back previous increments. - */ - cur_parent = binding_parent; - for(i = 0; i < lama_binding_num_levels; ++i) { - /* - * Account for the process placement in the MPPR - * Assumes a previous check. 
- */ - if( ORTE_SUCCESS != rmaps_lama_inc_mppr(cur_node, cur_parent) ) { - goto cleanup; - } - - /* - * Point to the next cousin - */ - if( NULL != (*cur_parent)->next_cousin ) { - cur_parent = &((*cur_parent)->next_cousin); - } - } - - /* - * Convert the cpuset to a slot_list for the remote daemon - */ - hwloc_bitmap_list_asprintf(&slot_list, binding_cpuset); - - cleanup: - hwloc_bitmap_free(scratch_cpuset); - hwloc_bitmap_free(binding_cpuset); - free(binding_parent); - - return slot_list; -} - - -/********************************* - * Timer Support - *********************************/ -static double rmaps_lama_get_time(void) -{ - double wtime; - -#if OPAL_TIMER_USEC_NATIVE - wtime = (double)opal_timer_base_get_usec() / 1000000.0; -#else - struct timeval tv; - gettimeofday(&tv, NULL); - wtime = tv.tv_sec; - wtime += (double)tv.tv_usec / 1000000.0; -#endif - - return wtime; -} - -static void rmaps_lama_set_time(int idx, bool is_start) -{ - if(idx < RMAPS_LAMA_TIMER_MAX ) { - if( is_start ) { - timer_start[idx] = rmaps_lama_get_time(); - } else { - timer_end[idx] = rmaps_lama_get_time(); - timer_accum[idx] += timer_end[idx] - timer_start[idx]; - } - } -} - -static void rmaps_lama_display_all_timers(void) -{ - double diff = 0.0; - double total = 0.0; - char * label = NULL; - - opal_output(0, - "mca:rmaps:lama: Timing: ---------------------------\n"); - - /* - * Timer: Parse Parameters - */ - label = strdup("Parse Params"); - diff = timer_accum[RMAPS_LAMA_TIMER_PARSE_PARAMS]; - rmaps_lama_display_indv_timer_core(diff, label); - free(label); - total += diff; - - /* - * Timer: Build Max Tree - */ - label = strdup("Build Max Tree"); - diff = timer_accum[RMAPS_LAMA_TIMER_BUILD_MAX_TREE]; - rmaps_lama_display_indv_timer_core(diff, label); - free(label); - total += diff; - - /* - * Timer: Mapping - */ - label = strdup("Mapping"); - diff = timer_accum[RMAPS_LAMA_TIMER_MAPPING]; - rmaps_lama_display_indv_timer_core(diff, label); - free(label); - total += diff; - - /* - * 
Timer: Ordering - */ - label = strdup("Ordering"); - diff = timer_accum[RMAPS_LAMA_TIMER_ORDERING]; - rmaps_lama_display_indv_timer_core(diff, label); - free(label); - total += diff; - - /* - * Timer: Total Overhead - */ - label = strdup("Other Overhead"); - diff = timer_accum[RMAPS_LAMA_TIMER_TOTAL]; - rmaps_lama_display_indv_timer_core(diff - total, label); - free(label); - - /* - * Timer: Total - */ - label = strdup("Total"); - diff = timer_accum[RMAPS_LAMA_TIMER_TOTAL]; - rmaps_lama_display_indv_timer_core(diff, label); - free(label); - - opal_output(0, - "mca:rmaps:lama: ---------------------------------"); -} - -static void rmaps_lama_clear_timers(void) -{ - int i; - for(i = 0; i < RMAPS_LAMA_TIMER_MAX; ++i) { - timer_start[i] = 0.0; - timer_end[i] = 0.0; - timer_accum[i] = 0.0; - } -} - - -static void rmaps_lama_display_indv_timer_core(double diff, char *str) -{ - double perc = 0; - double total = 0; - - total = timer_end[RMAPS_LAMA_TIMER_TOTAL] - timer_start[RMAPS_LAMA_TIMER_TOTAL]; - perc = (diff/total) * 100; - - opal_output(0, - "mca:rmaps:lama: \t%-20s = %10.2f ms\t%6.2f %s\n", - str, (diff * 1000), perc, "%"); - return; -} diff --git a/orte/mca/rmaps/lama/rmaps_lama_params.c b/orte/mca/rmaps/lama/rmaps_lama_params.c deleted file mode 100644 index 6a54b4ba340..00000000000 --- a/orte/mca/rmaps/lama/rmaps_lama_params.c +++ /dev/null @@ -1,878 +0,0 @@ -/* - * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2013 Cisco Systems, Inc. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** - * Processing for command line interface options - * - */ -#include "rmaps_lama.h" - -#include "opal/util/argv.h" - -#include "orte/mca/rmaps/base/rmaps_private.h" -#include "orte/mca/rmaps/base/base.h" -#include "orte/util/show_help.h" - -#include - -/********************************* - * Local Functions - *********************************/ -/* - * QSort: Integer comparison - */ -static int lama_parse_int_sort(const void *a, const void *b); - -/* - * Convert the '-ppr' syntax from the 'ppr' component to the 'lama' '-mppr' syntax. - */ -static char * rmaps_lama_covert_ppr(char * given_ppr); - -/********************************* - * Parsing Functions - *********************************/ -int rmaps_lama_process_alias_params(orte_job_t *jdata) -{ - int exit_status = ORTE_SUCCESS; - - /* - * Mapping options - * Note: L1, L2, L3 are not exposed in orterun to the user, so - * there is no need to specify them here. 
- */ - if( NULL == rmaps_lama_cmd_map ) { - /* orte_rmaps_base.mapping */ - switch( ORTE_GET_MAPPING_POLICY(jdata->map->mapping) ) { - case ORTE_MAPPING_BYNODE: - /* rmaps_lama_cmd_map = strdup("nbNsL3L2L1ch"); */ - rmaps_lama_cmd_map = strdup("nbsch"); - break; - case ORTE_MAPPING_BYBOARD: - /* rmaps_lama_cmd_map = strdup("bnNsL3L2L1ch"); */ - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mapping option", - true, - "by board", "mapping by board not supported by LAMA"); - exit_status = ORTE_ERR_NOT_SUPPORTED; - goto cleanup; - break; - case ORTE_MAPPING_BYNUMA: - /* rmaps_lama_cmd_map = strdup("NbnsL3L2L1ch"); */ - rmaps_lama_cmd_map = strdup("Nbnsch"); - break; - case ORTE_MAPPING_BYSOCKET: - /* rmaps_lama_cmd_map = strdup("sNbnL3L2L1ch"); */ - rmaps_lama_cmd_map = strdup("sbnch"); - break; - case ORTE_MAPPING_BYL3CACHE: - rmaps_lama_cmd_map = strdup("L3sNbnL2L1ch"); - break; - case ORTE_MAPPING_BYL2CACHE: - rmaps_lama_cmd_map = strdup("L2sNbnL1ch"); - break; - case ORTE_MAPPING_BYL1CACHE: - rmaps_lama_cmd_map = strdup("L1sNbnch"); - break; - case ORTE_MAPPING_BYCORE: - case ORTE_MAPPING_BYSLOT: - /* rmaps_lama_cmd_map = strdup("cL1L2L3sNbnh"); */ - rmaps_lama_cmd_map = strdup("csbnh"); - break; - case ORTE_MAPPING_BYHWTHREAD: - /* rmaps_lama_cmd_map = strdup("hcL1L2L3sNbn"); */ - rmaps_lama_cmd_map = strdup("hcsbn"); - break; - case ORTE_MAPPING_RR: - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mapping option", - true, - "round robin", "mapping by round robin not supported by LAMA"); - exit_status = ORTE_ERR_NOT_SUPPORTED; - goto cleanup; - case ORTE_MAPPING_SEQ: - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mapping option", - true, - "sequential", "mapping by sequential not supported by LAMA"); - exit_status = ORTE_ERR_NOT_SUPPORTED; - goto cleanup; - case ORTE_MAPPING_BYUSER: - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mapping option", - true, - "by user", "mapping by user not supported by LAMA"); - exit_status = 
ORTE_ERR_NOT_SUPPORTED; - goto cleanup; - default: - /* - * Default is map-by core - */ - rmaps_lama_cmd_map = strdup("cL1L2L3sNbnh"); - break; - } - } - - /* - * Binding Options - */ - if( NULL == rmaps_lama_cmd_bind ) { - /* - * No binding specified, use default - */ - if( !OPAL_BINDING_POLICY_IS_SET(jdata->map->binding) || - !OPAL_BINDING_REQUIRED(opal_hwloc_binding_policy) || - OPAL_BIND_TO_NONE == OPAL_GET_BINDING_POLICY(jdata->map->binding) ) { - rmaps_lama_cmd_bind = NULL; - } - - switch( OPAL_GET_BINDING_POLICY(jdata->map->binding) ) { - case OPAL_BIND_TO_BOARD: - /* rmaps_lama_cmd_bind = strdup("1b"); */ - orte_show_help("help-orte-rmaps-lama.txt", - "invalid binding option", - true, - "by board", "binding to board not supported by LAMA"); - exit_status = ORTE_ERR_NOT_SUPPORTED; - goto cleanup; - break; - case OPAL_BIND_TO_NUMA: - rmaps_lama_cmd_bind = strdup("1N"); - break; - case OPAL_BIND_TO_SOCKET: - rmaps_lama_cmd_bind = strdup("1s"); - break; - case OPAL_BIND_TO_L3CACHE: - rmaps_lama_cmd_bind = strdup("1L3"); - break; - case OPAL_BIND_TO_L2CACHE: - rmaps_lama_cmd_bind = strdup("1L2"); - break; - case OPAL_BIND_TO_L1CACHE: - rmaps_lama_cmd_bind = strdup("1L1"); - break; - case OPAL_BIND_TO_CORE: - rmaps_lama_cmd_bind = strdup("1c"); - break; - case OPAL_BIND_TO_HWTHREAD: - rmaps_lama_cmd_bind = strdup("1h"); - break; - case OPAL_BIND_TO_CPUSET: - orte_show_help("help-orte-rmaps-lama.txt", - "invalid binding option", - true, - "by CPU set", "binding to CPU set not supported by LAMA"); - exit_status = ORTE_ERR_NOT_SUPPORTED; - goto cleanup; - break; - default: - rmaps_lama_cmd_bind = NULL; - break; - } - } - - /* - * Ordering (a.k.a. 
Ranking) Options - */ - if( NULL == rmaps_lama_cmd_ordering ) { - /* orte_rmaps_base.ranking */ - switch( ORTE_GET_RANKING_POLICY(jdata->map->ranking) ) { - case ORTE_RANK_BY_SLOT: - rmaps_lama_cmd_ordering = strdup("s"); - break; - case ORTE_RANK_BY_NODE: - case ORTE_RANK_BY_NUMA: - case ORTE_RANK_BY_SOCKET: - case ORTE_RANK_BY_L3CACHE: - case ORTE_RANK_BY_L2CACHE: - case ORTE_RANK_BY_L1CACHE: - case ORTE_RANK_BY_CORE: - case ORTE_RANK_BY_HWTHREAD: - rmaps_lama_cmd_ordering = strdup("n"); - break; - case ORTE_RANK_BY_BOARD: - /* rmaps_lama_cmd_ordering = strdup("n"); */ - orte_show_help("help-orte-rmaps-lama.txt", - "invalid ordering option", - true, - "by board", "ordering by board not supported by LAMA"); - exit_status = ORTE_ERR_NOT_SUPPORTED; - goto cleanup; - break; - default: - rmaps_lama_cmd_ordering = strdup("n"); - break; - } - } - - /* - * MPPR - */ - if( NULL == rmaps_lama_cmd_mppr ) { - /* - * The ppr is given in the map - */ - if( NULL != jdata->map->ppr) { - rmaps_lama_cmd_mppr = rmaps_lama_covert_ppr(jdata->map->ppr); - } - } - - /* - * Oversubscription - */ - if( ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping) ) { - rmaps_lama_can_oversubscribe = false; - } - else { - rmaps_lama_can_oversubscribe = true; - } - - /* - * Display revised values - */ - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Revised Parameters -----"); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Map : %s", - rmaps_lama_cmd_map); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Bind : %s", - rmaps_lama_cmd_bind); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: MPPR : %s", - rmaps_lama_cmd_mppr); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Order : %s", - rmaps_lama_cmd_ordering); - - cleanup: - return exit_status; -} - -static char * 
rmaps_lama_covert_ppr(char * given_ppr) -{ - return strdup(given_ppr); -} - -int rmaps_lama_parse_mapping(char *layout, - rmaps_lama_level_type_t **layout_types, - rmaps_lama_level_type_t **layout_types_sorted, - int *num_types) -{ - int exit_status = ORTE_SUCCESS; - char param[3]; - int i, j, len; - bool found_req_param_n = false; - bool found_req_param_h = false; - bool found_req_param_bind = false; - - /* - * Sanity Check: - * There is no default layout, so if we get here and nothing is specified - * then this is an error. - */ - if( NULL == layout ) { - orte_show_help("help-orte-rmaps-lama.txt", - "internal error", - true, - "rmaps_lama_parse_mapping", - "internal error 1"); - return ORTE_ERROR; - } - - *num_types = 0; - - /* - * Extract and convert all the keys - */ - len = strlen(layout); - for(i = 0; i < len; ++i) { - /* - * L1 : L1 Cache - * L2 : L2 Cache - * L3 : L3 Cache - */ - if( layout[i] == 'L' ) { - param[0] = layout[i]; - ++i; - /* - * Check for 2 characters - */ - if( i >= len ) { - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mapping option", - true, - layout, "cache level missing number"); - exit_status = ORTE_ERROR; - goto cleanup; - } - param[1] = layout[i]; - param[2] = '\0'; - } - /* - * n : Machine - * b : Board - * s : Socket - * c : Core - * h : Hardware Thread - * N : NUMA Node - */ - else { - param[0] = layout[i]; - param[1] = '\0'; - } - - /* - * Append level - */ - *num_types += 1; - *layout_types = (rmaps_lama_level_type_t*)realloc(*layout_types, sizeof(rmaps_lama_level_type_t) * (*num_types)); - (*layout_types)[(*num_types)-1] = lama_type_str_to_enum(param); - } - - /* - * Check for duplicates and unknowns - * Copy to sorted list - */ - *layout_types_sorted = (rmaps_lama_level_type_t*)malloc(sizeof(rmaps_lama_level_type_t) * (*num_types)); - for( i = 0; i < *num_types; ++i ) { - /* - * Copy for later sorting - */ - (*layout_types_sorted)[i] = (*layout_types)[i]; - - /* - * Look for unknown and unsupported options - */ - if( 
LAMA_LEVEL_UNKNOWN <= (*layout_types)[i] ) { - char *msg; - asprintf(&msg, "unknown mapping level at position %d", i + 1); - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mapping option", - true, - layout, msg); - free(msg); - exit_status = ORTE_ERROR; - goto cleanup; - } - - if( LAMA_LEVEL_MACHINE == (*layout_types)[i] ) { - found_req_param_n = true; - } - - if( LAMA_LEVEL_PU == (*layout_types)[i] ) { - found_req_param_h = true; - } - - if( lama_binding_level == (*layout_types)[i] ) { - found_req_param_bind = true; - } - - /* - * Look for duplicates - */ - for( j = i+1; j < *num_types; ++j ) { - if( (*layout_types)[i] == (*layout_types)[j] ) { - char *msg; - asprintf(&msg, "duplicate mapping levels at position %d and %d", - i + 1, j + 1); - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mapping option", - true, - layout, msg); - free(msg); - exit_status = ORTE_ERROR; - goto cleanup; - } - } - } - - /* - * The user is required to specify at least the: - * - machine - * - hardware thread (needed for lower bound binding) JJH: We should be able to lift this... - * - binding layer (need it to stride the mapping) - * Only print the error message once, for brevity. 
- */ - if( !found_req_param_n ) { - char *msg; - asprintf(&msg, "missing required 'n' mapping token"); - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mapping option", - true, - layout, msg); - free(msg); - exit_status = ORTE_ERROR; - goto cleanup; - } - else if(!found_req_param_h) { - char *msg; - asprintf(&msg, "missing required 'h' mapping token"); - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mapping option", - true, - layout, msg); - free(msg); - exit_status = ORTE_ERROR; - goto cleanup; - } else if (!found_req_param_bind) { - char *msg; - asprintf(&msg, "missing required mapping token for the current binding level"); - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mapping option", - true, - layout, msg); - free(msg); - exit_status = ORTE_ERROR; - goto cleanup; - } - - /* - * Sort the items - */ - qsort((*layout_types_sorted ), (*num_types), sizeof(int), lama_parse_int_sort); - - cleanup: - return exit_status; -} - -int rmaps_lama_parse_binding(char *layout, rmaps_lama_level_type_t *binding_level, int *num_types) -{ - int exit_status = ORTE_SUCCESS; - char param[3]; - char num[MAX_BIND_DIGIT_LEN]; - int i, n, p, len; - - /* - * Default: If nothing specified - * - Bind to machine - */ - if( NULL == layout ) { - *binding_level = LAMA_LEVEL_MACHINE; - *num_types = 1; - return ORTE_SUCCESS; - } - - *num_types = 0; - - /* - * Extract and convert all the keys - */ - len = strlen(layout); - n = 0; - p = 0; - for(i = 0; i < len; ++i) { - /* - * Must start with a digit - */ - if( isdigit(layout[i]) ) { - /* - * Check: Digits must come first - */ - if( p != 0 ) { - orte_show_help("help-orte-rmaps-lama.txt", - "invalid binding option", - true, - layout, "missing digit(s) before binding level token"); - exit_status = ORTE_ERROR; - goto cleanup; - } - - num[n] = layout[i]; - ++n; - /* - * Check: Exceed bound of number of digits - */ - if( n >= MAX_BIND_DIGIT_LEN ) { - orte_show_help("help-orte-rmaps-lama.txt", - "invalid binding option", - true, - 
layout, "too many digits"); - exit_status = ORTE_ERROR; - goto cleanup; - } - } - /* - * Extract the level - */ - else { - /* - * Check: Digits must come first - */ - if( n == 0 ) { - orte_show_help("help-orte-rmaps-lama.txt", - "invalid binding option", - true, - layout, "missing digit(s) before binding level token"); - exit_status = ORTE_ERROR; - goto cleanup; - } - /* - * Check: Only one level allowed - */ - if( p != 0 ) { - orte_show_help("help-orte-rmaps-lama.txt", - "invalid binding option", - true, - layout, "only one binding level may be specified"); - exit_status = ORTE_ERROR; - goto cleanup; - } - - /* - * L1 : L1 Cache - * L2 : L2 Cache - * L3 : L3 Cache - */ - if( layout[i] == 'L' ) { - param[0] = layout[i]; - ++i; - /* - * Check for 2 characters - */ - if( i >= len ) { - orte_show_help("help-orte-rmaps-lama.txt", - "invalid binding option", - true, - layout, "only one binding level may be specified"); - exit_status = ORTE_ERROR; - goto cleanup; - } - param[1] = layout[i]; - p = 2; - } - /* - * n : Machine - * b : Board - * s : Socket - * c : Core - * h : Hardware Thread - * N : NUMA Node - */ - else { - param[0] = layout[i]; - p = 1; - } - param[p] = '\0'; - } - } - /* - * Check that the level was specified - */ - if( p == 0 ) { - orte_show_help("help-orte-rmaps-lama.txt", - "invalid binding option", - true, - layout, "binding specification is empty"); - exit_status = ORTE_ERROR; - goto cleanup; - } - num[n] = '\0'; - - *binding_level = lama_type_str_to_enum(param); - *num_types = atoi(num); - - /* - * Check for unknown level - */ - if( LAMA_LEVEL_UNKNOWN <= *binding_level ) { - orte_show_help("help-orte-rmaps-lama.txt", - "invalid binding option", - true, - layout, "unknown binding level"); - exit_status = ORTE_ERROR; - goto cleanup; - } - - cleanup: - return exit_status; -} - -int rmaps_lama_parse_mppr(char *layout, rmaps_lama_level_info_t **mppr_levels, int *num_types) -{ - int exit_status = ORTE_SUCCESS; - char param[3]; - char 
num[MAX_BIND_DIGIT_LEN]; - char **argv = NULL; - int argc = 0; - int i, j, len; - int p, n; - - /* - * Default: Unrestricted allocation - * 'oversubscribe' flag accounted for elsewhere - */ - if( NULL == layout ) { - *mppr_levels = NULL; - *num_types = 0; - return ORTE_SUCCESS; - } - - *num_types = 0; - - /* - * Split by ',' - * <#:level>,<#:level>,... - */ - argv = opal_argv_split(layout, ','); - argc = opal_argv_count(argv); - for(j = 0; j < argc; ++j) { - /* - * Parse <#:level> - */ - len = strlen(argv[j]); - n = 0; - p = 0; - for(i = 0; i < len; ++i) { - /* - * Skip the ':' separator and whitespace - */ - if( argv[j][i] == ':' || isblank(argv[j][i])) { - continue; - } - /* - * Must start with a digit - */ - else if( isdigit(argv[j][i]) ) { - /* - * Check: Digits must come first - */ - if( p != 0 ) { - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mppr option", - true, - layout, "missing digit(s) before resource specification"); - exit_status = ORTE_ERROR; - goto cleanup; - } - - num[n] = argv[j][i]; - ++n; - /* - * Check: Exceed bound of number of digits - */ - if( n >= MAX_BIND_DIGIT_LEN ) { - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mppr option", - true, - layout, "too many digits"); - exit_status = ORTE_ERROR; - goto cleanup; - } - } - /* - * Extract the level - */ - else { - /* - * Check: Digits must come first - */ - if( n == 0 ) { - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mppr option", - true, - layout, "missing digit(s) before resource specification"); - exit_status = ORTE_ERROR; - goto cleanup; - } - /* - * Check: Only one level allowed - */ - if( p != 0 ) { - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mppr option", - true, - layout, "only one resource type may be listed per specification"); - exit_status = ORTE_ERROR; - goto cleanup; - } - - /* - * L1 : L1 Cache - * L2 : L2 Cache - * L3 : L3 Cache - */ - if( argv[j][i] == 'L' ) { - param[0] = argv[j][i]; - ++i; - /* - * Check for 2 characters - */ - if( i 
>= len ) { - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mppr option", - true, - layout, "cache level missing number"); - exit_status = ORTE_ERROR; - goto cleanup; - } - param[1] = argv[j][i]; - p = 2; - } - /* - * n : Machine - * b : Board - * s : Socket - * c : Core - * h : Hardware Thread - * N : NUMA Node - */ - else { - param[0] = argv[j][i]; - p = 1; - } - param[p] = '\0'; - } - } - - /* - * Whitespace, just skip - */ - if( n == 0 && p == 0 ) { - continue; - } - - /* - * Check that the level was specified - */ - if( p == 0 ) { - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mppr option", - true, - layout, "resource type not specified"); - exit_status = ORTE_ERROR; - goto cleanup; - } - num[n] = '\0'; - - /* - * Append level - */ - *num_types += 1; - *mppr_levels = (rmaps_lama_level_info_t*)realloc(*mppr_levels, sizeof(rmaps_lama_level_info_t) * (*num_types)); - (*mppr_levels)[(*num_types)-1].type = lama_type_str_to_enum(param); - (*mppr_levels)[(*num_types)-1].max_resources = atoi(num); - - } - - /* - * Check for duplicates and unknowns - */ - for( i = 0; i < *num_types; ++i ) { - /* - * Look for unknown and unsupported options - */ - if( LAMA_LEVEL_UNKNOWN <= (*mppr_levels)[i].type ) { - char *msg; - asprintf(&msg, "unknown resource type at position %d", i + 1); - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mppr option", - true, - layout, msg); - free(msg); - exit_status = ORTE_ERROR; - goto cleanup; - } - - /* - * Look for duplicates - */ - for( j = i+1; j < *num_types; ++j ) { - if( (*mppr_levels)[i].type == (*mppr_levels)[j].type ) { - char *msg; - asprintf(&msg, "duplicate resource tpyes at position %d and %d", - i + 1, j + 1); - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mppr option", - true, - layout, msg); - free(msg); - exit_status = ORTE_ERROR; - goto cleanup; - } - } - } - - cleanup: - if( NULL != argv ) { - opal_argv_free(argv); - argv = NULL; - } - - return exit_status; -} - -int 
rmaps_lama_parse_ordering(char *layout, - rmaps_lama_order_type_t *order) -{ - /* - * Default: Natural ordering - */ - if( NULL == layout ) { - *order = LAMA_ORDER_NATURAL; - return ORTE_SUCCESS; - } - - /* - * Sequential Ordering - */ - if( 0 == strncmp(layout, "s", strlen("s")) || - 0 == strncmp(layout, "S", strlen("S")) ) { - *order = LAMA_ORDER_SEQ; - } - /* - * Natural Ordering - */ - else if( 0 == strncmp(layout, "n", strlen("n")) || - 0 == strncmp(layout, "N", strlen("N")) ) { - *order = LAMA_ORDER_NATURAL; - } - /* - * Check for unknown options - */ - else { - orte_show_help("help-orte-rmaps-lama.txt", - "invalid ordering option", - true, - "unsupported ordering option", layout); - return ORTE_ERROR; - } - - return ORTE_SUCCESS; -} - -bool rmaps_lama_ok_to_prune_level(rmaps_lama_level_type_t level) -{ - int i; - - for( i = 0; i < lama_mapping_num_layouts; ++i ) { - if( level == lama_mapping_layout[i] ) { - return false; - } - } - - return true; -} - -/********************************* - * Support Functions - *********************************/ -static int lama_parse_int_sort(const void *a, const void *b) { - int left = *((int*)a); - int right = *((int*)b); - - if( left < right ) { - return -1; - } - else if( left > right ) { - return 1; - } - else { - return 0; - } -} diff --git a/orte/mca/rmaps/mindist/rmaps_mindist_module.c b/orte/mca/rmaps/mindist/rmaps_mindist_module.c index 53ce91f71ae..29d5e7813b5 100644 --- a/orte/mca/rmaps/mindist/rmaps_mindist_module.c +++ b/orte/mca/rmaps/mindist/rmaps_mindist_module.c @@ -45,7 +45,7 @@ static int mindist_map(orte_job_t *jdata); orte_rmaps_base_module_t orte_rmaps_mindist_module = { - mindist_map + .map_job = mindist_map }; /* @@ -391,15 +391,6 @@ static int mindist_map(orte_job_t *jdata) } } - /* compute vpids and add proc objects to the job - do this after - * each app_context so that the ranks within each context are - * contiguous - */ - if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata, app, 
&node_list))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* track the total number of processes we mapped - must update * this value AFTER we compute vpids so that computation * is done correctly @@ -415,6 +406,17 @@ static int mindist_map(orte_job_t *jdata) OBJ_DESTRUCT(&node_list); } free(orte_rmaps_base.device); + /* compute vpids and add proc objects to the job - this is done + * once for the whole job, after every app_context has been + * mapped + */ + if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + /* mark the job as fully described */ + orte_set_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL); return ORTE_SUCCESS; error: @@ -425,3 +427,96 @@ static int mindist_map(orte_job_t *jdata) return rc; } + +#if 0 +static int assign_locations(orte_job_t *jdata) +{ + int j, k, m, n, npus; + orte_app_context_t *app; + orte_node_t *node; + orte_proc_t *proc; + hwloc_obj_t obj=NULL; + mca_base_component_t *c = &mca_rmaps_mindist_component.base_version; + int rc; + opal_list_t numa_list; + opal_rmaps_numa_node_t *numa; + + if (NULL == jdata->map->last_mapper|| + 0 != strcasecmp(jdata->map->last_mapper, c->mca_component_name)) { + /* the mapper should have been set to me */ + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:mindist: job %s not using mindist mapper", + ORTE_JOBID_PRINT(jdata->jobid)); + return ORTE_ERR_TAKE_NEXT_OPTION; + } + + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:mindist: assign locations for job %s", + ORTE_JOBID_PRINT(jdata->jobid)); + + /* start assigning procs to objects, filling each object as we go until + * all procs are assigned. 
If one pass doesn't catch all the required procs, + * then loop thru the list again to handle the oversubscription + */ + for (n=0; n < jdata->apps->size; n++) { + if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, n))) { + continue; + } + for (m=0; m < jdata->map->nodes->size; m++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, m))) { + continue; + } + if (NULL == node->topology || NULL == node->topology->topo) { + orte_show_help("help-orte-rmaps-ppr.txt", "ppr-topo-missing", + true, node->name); + return ORTE_ERR_SILENT; + } + + /* first we need to fill summary object for root with information about nodes + * so we call opal_hwloc_base_get_nbobjs_by_type */ + opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, HWLOC_OBJ_NODE, 0, OPAL_HWLOC_AVAILABLE); + OBJ_CONSTRUCT(&numa_list, opal_list_t); + rc = opal_hwloc_get_sorted_numa_list(node->topology->topo, orte_rmaps_base.device, &numa_list); + if (rc > 1) { + orte_show_help("help-orte-rmaps-md.txt", "orte-rmaps-mindist:several-devices", + true, orte_rmaps_base.device, rc, node->name); + ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT); + OPAL_LIST_DESTRUCT(&numa_list); + return ORTE_ERR_TAKE_NEXT_OPTION; + } else if (rc < 0) { + orte_show_help("help-orte-rmaps-md.txt", "orte-rmaps-mindist:device-not-found", + true, orte_rmaps_base.device, node->name); + ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT); + OPAL_LIST_DESTRUCT(&numa_list); + return ORTE_ERR_TAKE_NEXT_OPTION; + } + j = 0; + OPAL_LIST_FOREACH(numa, &numa_list, opal_rmaps_numa_node_t) { + /* get the hwloc object for this numa */ + if (NULL == (obj = opal_hwloc_base_get_obj_by_type(node->topology->topo, HWLOC_OBJ_NODE, 0, numa->index, OPAL_HWLOC_AVAILABLE))) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + OPAL_LIST_DESTRUCT(&numa_list); + return ORTE_ERR_NOT_FOUND; + } + npus = opal_hwloc_base_get_npus(node->topology->topo, obj); + /* fill the numa region 
with procs from this job until we either + * have assigned everyone or the region is full */ + for (k = j; k < node->procs->size && 0 < npus; k++) { + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, k))) { + continue; + } + if (proc->name.jobid != jdata->jobid) { + continue; + } + orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR); + ++j; + --npus; + } + } + OPAL_LIST_DESTRUCT(&numa_list); + } + } + + return ORTE_SUCCESS; +} +#endif diff --git a/orte/mca/rmaps/ppr/rmaps_ppr.c b/orte/mca/rmaps/ppr/rmaps_ppr.c index 35285e95cda..41523de3b6b 100644 --- a/orte/mca/rmaps/ppr/rmaps_ppr.c +++ b/orte/mca/rmaps/ppr/rmaps_ppr.c @@ -33,9 +33,11 @@ #include "rmaps_ppr.h" static int ppr_mapper(orte_job_t *jdata); +static int assign_locations(orte_job_t *jdata); orte_rmaps_base_module_t orte_rmaps_ppr_module = { - ppr_mapper + .map_job = ppr_mapper, + .assign_locations = assign_locations }; /* RHC: will eventually remove this @@ -391,11 +393,6 @@ static int ppr_mapper(orte_job_t *jdata) rc = ORTE_ERR_SILENT; goto error; } - /* compute vpids and add proc objects to the job */ - if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata, app, &node_list))) { - ORTE_ERROR_LOG(rc); - goto error; - } /* track the total number of processes we mapped - must update * this AFTER we compute vpids so that computation is done @@ -623,3 +620,122 @@ static void prune(orte_jobid_t jobid, error: opal_output(0, "INFINITE LOOP"); } + +static int assign_locations(orte_job_t *jdata) +{ + int i, j, m, n; + mca_base_component_t *c=&mca_rmaps_ppr_component.base_version; + orte_node_t *node; + orte_proc_t *proc; + orte_app_context_t *app; + opal_hwloc_level_t level; + hwloc_obj_t obj; + unsigned int cache_level=0; + int ppr, cnt, nobjs, nprocs_mapped; + char **ppr_req, **ck; + + if (NULL == jdata->map->last_mapper || + 0 != strcasecmp(jdata->map->last_mapper, c->mca_component_name)) { + /* a mapper has been specified, and it isn't 
me */ + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:ppr: job %s not using ppr assign: %s", + ORTE_JOBID_PRINT(jdata->jobid), + (NULL == jdata->map->last_mapper) ? "NULL" : jdata->map->last_mapper); + return ORTE_ERR_TAKE_NEXT_OPTION; + } + + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:ppr: assigning locations for job %s with ppr %s policy %s", + ORTE_JOBID_PRINT(jdata->jobid), jdata->map->ppr, + orte_rmaps_base_print_mapping(jdata->map->mapping)); + + /* pickup the object level */ + if (ORTE_MAPPING_BYNODE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + level = OPAL_HWLOC_NODE_LEVEL; + } else if (ORTE_MAPPING_BYHWTHREAD == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + level = OPAL_HWLOC_HWTHREAD_LEVEL; + } else if (ORTE_MAPPING_BYCORE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + level = OPAL_HWLOC_CORE_LEVEL; + } else if (ORTE_MAPPING_BYSOCKET == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + level = OPAL_HWLOC_SOCKET_LEVEL; + } else if (ORTE_MAPPING_BYL1CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + level = OPAL_HWLOC_L1CACHE_LEVEL; + cache_level = 1; + } else if (ORTE_MAPPING_BYL2CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + level = OPAL_HWLOC_L2CACHE_LEVEL; + cache_level = 2; + } else if (ORTE_MAPPING_BYL3CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + level = OPAL_HWLOC_L3CACHE_LEVEL; + cache_level = 3; + } else if (ORTE_MAPPING_BYNUMA == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + level = OPAL_HWLOC_NUMA_LEVEL; + } else { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return ORTE_ERR_TAKE_NEXT_OPTION; + } + + /* get the ppr value */ + ppr_req = opal_argv_split(jdata->map->ppr, ','); + ck = opal_argv_split(ppr_req[0], ':'); + ppr = strtol(ck[0], NULL, 10); + opal_argv_free(ck); + opal_argv_free(ppr_req); + + /* start assigning procs to objects, filling each object as we go until + * all procs are assigned. 
*/ + for (n=0; n < jdata->apps->size; n++) { + if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, n))) { + continue; + } + nprocs_mapped = 0; + for (m=0; m < jdata->map->nodes->size; m++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, m))) { + continue; + } + if (NULL == node->topology || NULL == node->topology->topo) { + orte_show_help("help-orte-rmaps-ppr.txt", "ppr-topo-missing", + true, node->name); + return ORTE_ERR_SILENT; + } + if (OPAL_HWLOC_NODE_LEVEL == level) { + obj = hwloc_get_root_obj(node->topology->topo); + for (j=0; j < node->procs->size; j++) { + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { + continue; + } + if (proc->name.jobid != jdata->jobid) { + continue; + } + orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR); + } + } else { + /* get the number of resources on this node at this level */ + nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, + level, cache_level, + OPAL_HWLOC_AVAILABLE); + + /* map the specified number of procs to each such resource on this node, + * recording the locale of each proc so we know its cpuset + */ + cnt = 0; + for (i=0; i < nobjs; i++) { + obj = opal_hwloc_base_get_obj_by_type(node->topology->topo, + level, cache_level, + i, OPAL_HWLOC_AVAILABLE); + for (j=0; j < node->procs->size && cnt < ppr && nprocs_mapped < app->num_procs; j++) { + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { + continue; + } + if (proc->name.jobid != jdata->jobid) { + continue; + } + nprocs_mapped++; + cnt++; + orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR); + } + } + } + } + } + return ORTE_SUCCESS; +} diff --git a/orte/mca/rmaps/rank_file/rmaps_rank_file.c b/orte/mca/rmaps/rank_file/rmaps_rank_file.c index 26d19f6881e..ee8651d5b2b 100644 --- a/orte/mca/rmaps/rank_file/rmaps_rank_file.c +++ 
b/orte/mca/rmaps/rank_file/rmaps_rank_file.c @@ -51,6 +51,13 @@ #include "orte/mca/rmaps/rank_file/rmaps_rank_file_lex.h" #include "orte/runtime/orte_globals.h" +static int orte_rmaps_rf_map(orte_job_t *jdata); + +orte_rmaps_base_module_t orte_rmaps_rank_file_module = { + .map_job = orte_rmaps_rf_map +}; + + static int orte_rmaps_rank_file_parse(const char *); static char *orte_rmaps_rank_file_parse_string_or_int(void); static const char *orte_rmaps_rank_file_name_cur = NULL; @@ -363,6 +370,9 @@ static int orte_rmaps_rf_map(orte_job_t *jdata) } } OBJ_DESTRUCT(&rankmap); + /* mark the job as fully described */ + orte_set_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL); + return rc; error: @@ -371,11 +381,6 @@ static int orte_rmaps_rf_map(orte_job_t *jdata) return rc; } -orte_rmaps_base_module_t orte_rmaps_rank_file_module = { -orte_rmaps_rf_map -}; - - static int orte_rmaps_rank_file_parse(const char *rankfile) { int token; diff --git a/orte/mca/rmaps/resilient/rmaps_resilient.c b/orte/mca/rmaps/resilient/rmaps_resilient.c index afc4576737b..3ead4d31305 100644 --- a/orte/mca/rmaps/resilient/rmaps_resilient.c +++ b/orte/mca/rmaps/resilient/rmaps_resilient.c @@ -5,7 +5,7 @@ * Corporation. All rights reserved. * Copyright (c) 2011-2012 Los Alamos National Security, LLC. * All rights reserved. - * Copyright (c) 2014-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. 
* * $COPYRIGHT$ * @@ -36,6 +36,14 @@ #include "orte/mca/rmaps/base/base.h" #include "rmaps_resilient.h" +static int orte_rmaps_resilient_map(orte_job_t *jdata); +static int resilient_assign(orte_job_t *jdata); + +orte_rmaps_base_module_t orte_rmaps_resilient_module = { + .map_job = orte_rmaps_resilient_map, + .assign_locations = resilient_assign +}; + /* * Local variable @@ -270,9 +278,22 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata) return rc; } -orte_rmaps_base_module_t orte_rmaps_resilient_module = { - orte_rmaps_resilient_map -}; +static int resilient_assign(orte_job_t *jdata) +{ + mca_base_component_t *c = &mca_rmaps_resilient_component.super.base_version; + + if (NULL == jdata->map->last_mapper || + 0 != strcasecmp(jdata->map->last_mapper, c->mca_component_name)) { + /* a mapper has been specified, and it isn't me */ + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:resilient: job %s not using resilient assign: %s", + ORTE_JOBID_PRINT(jdata->jobid), + (NULL == jdata->map->last_mapper) ? 
"NULL" : jdata->map->last_mapper); + return ORTE_ERR_TAKE_NEXT_OPTION; + } + + return ORTE_ERR_NOT_IMPLEMENTED; +} static char *orte_getline(FILE *fp) { @@ -855,15 +876,6 @@ static int map_to_ftgrps(orte_job_t *jdata) /* track number of procs */ jdata->num_procs += app->num_procs; - /* compute vpids and add proc objects to the job - this has to be - * done after each app_context is mapped in order to keep the - * vpids contiguous within an app_context - */ - if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata, app, &node_list))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* cleanup the node list - it can differ from one app_context * to another, so we have to get it every time */ @@ -873,11 +885,5 @@ static int map_to_ftgrps(orte_job_t *jdata) OBJ_DESTRUCT(&node_list); } - /* compute and save local ranks */ - if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) { - ORTE_ERROR_LOG(rc); - return rc; - } - return ORTE_SUCCESS; } diff --git a/orte/mca/rmaps/rmaps.h b/orte/mca/rmaps/rmaps.h index 9670c7ac2c8..4faaf2e2cb8 100644 --- a/orte/mca/rmaps/rmaps.h +++ b/orte/mca/rmaps/rmaps.h @@ -60,23 +60,30 @@ BEGIN_C_DECLS * rmaps module functions */ -/* mapping event - the event one activates to schedule mapping - * of procs to nodes for pending jobs - */ -ORTE_DECLSPEC extern opal_event_t orte_mapping_event; - /** * RMAPS module functions - these are not accessible to the outside world, * but are defined here by convention */ + +/* map a job - used by the HNP to compute the #procs on each node. + * This is passed to the backend daemons as a regex which they + * use to create an orte_job_map_t for the job */ typedef int (*orte_rmaps_base_module_map_fn_t)(orte_job_t *jdata); +/* assign a location to each process. 
Used by the backend daemons, + * this function takes the orte_job_map_t created from the regex + * and assigns each process to a specific location within the + * hardware topology based on the --map-by directive */ +typedef int (*orte_rmaps_base_module_assign_loc_fn_t)(orte_job_t *jdata); + /* * rmaps module version 3.0.0 */ struct orte_rmaps_base_module_3_0_0_t { /** Mapping function pointer */ orte_rmaps_base_module_map_fn_t map_job; + /* assign locations */ + orte_rmaps_base_module_assign_loc_fn_t assign_locations; }; /** Convenience typedef */ typedef struct orte_rmaps_base_module_3_0_0_t orte_rmaps_base_module_3_0_0_t; diff --git a/orte/mca/rmaps/round_robin/Makefile.am b/orte/mca/rmaps/round_robin/Makefile.am index 1f19dcc7657..bd51a226429 100644 --- a/orte/mca/rmaps/round_robin/Makefile.am +++ b/orte/mca/rmaps/round_robin/Makefile.am @@ -10,6 +10,7 @@ # Copyright (c) 2004-2005 The Regents of the University of California. # All rights reserved. # Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2017 Intel, Inc. All rights reserved. 
# $COPYRIGHT$ # # Additional copyrights may follow @@ -23,7 +24,8 @@ sources = \ rmaps_rr.c \ rmaps_rr.h \ rmaps_rr_component.c \ - rmaps_rr_mappers.c + rmaps_rr_mappers.c \ + rmaps_rr_assign.c # Make the output library in this directory, and name it either # mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la diff --git a/orte/mca/rmaps/round_robin/rmaps_rr.c b/orte/mca/rmaps/round_robin/rmaps_rr.c index a764e0243f3..b268c4953e7 100644 --- a/orte/mca/rmaps/round_robin/rmaps_rr.c +++ b/orte/mca/rmaps/round_robin/rmaps_rr.c @@ -243,15 +243,6 @@ static int orte_rmaps_rr_map(orte_job_t *jdata) goto error; } - /* compute vpids and add proc objects to the job - do this after - * each app_context so that the ranks within each context are - * contiguous - */ - if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata, app, &node_list))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* track the total number of processes we mapped - must update * this value AFTER we compute vpids so that computation * is done correctly @@ -278,6 +269,113 @@ static int orte_rmaps_rr_map(orte_job_t *jdata) return rc; } +static int orte_rmaps_rr_assign_locations(orte_job_t *jdata) +{ + mca_base_component_t *c = &mca_rmaps_round_robin_component.base_version; + int rc; + + if (NULL == jdata->map->last_mapper || + 0 != strcasecmp(jdata->map->last_mapper, c->mca_component_name)) { + /* a mapper has been specified, and it isn't me */ + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rr: job %s not using rr mapper", + ORTE_JOBID_PRINT(jdata->jobid)); + return ORTE_ERR_TAKE_NEXT_OPTION; + } + + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rr: assign locations for job %s", + ORTE_JOBID_PRINT(jdata->jobid)); + + /* if the mapping directive was byslot or bynode, then we + * assign locations to the root object level */ + if (ORTE_MAPPING_BYNODE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping) || + ORTE_MAPPING_BYSLOT == 
ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + return orte_rmaps_rr_assign_root_level(jdata); + } + + /* otherwise, assign by object */ + if (ORTE_MAPPING_BYHWTHREAD == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + rc = orte_rmaps_rr_assign_byobj(jdata, HWLOC_OBJ_PU, 0); + if (ORTE_ERR_NOT_FOUND == rc) { + /* if the mapper couldn't assign by this object because + * it isn't available, but the error allows us to try + * byslot, then do so + */ + ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT); + rc = orte_rmaps_rr_assign_root_level(jdata); + } + } else if (ORTE_MAPPING_BYCORE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + rc = orte_rmaps_rr_assign_byobj(jdata, HWLOC_OBJ_CORE, 0); + if (ORTE_ERR_NOT_FOUND == rc) { + /* if the mapper couldn't map by this object because + * it isn't available, but the error allows us to try + * byslot, then do so + */ + ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT); + rc = orte_rmaps_rr_assign_root_level(jdata); + } + } else if (ORTE_MAPPING_BYL1CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + rc = orte_rmaps_rr_assign_byobj(jdata, HWLOC_OBJ_CACHE, 1); + if (ORTE_ERR_NOT_FOUND == rc) { + /* if the mapper couldn't map by this object because + * it isn't available, but the error allows us to try + * byslot, then do so + */ + ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT); + rc = orte_rmaps_rr_assign_root_level(jdata); + } + } else if (ORTE_MAPPING_BYL2CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + rc = orte_rmaps_rr_assign_byobj(jdata, HWLOC_OBJ_CACHE, 2); + if (ORTE_ERR_NOT_FOUND == rc) { + /* if the mapper couldn't map by this object because + * it isn't available, but the error allows us to try + * byslot, then do so + */ + ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT); + rc = orte_rmaps_rr_assign_root_level(jdata); + } + } else if (ORTE_MAPPING_BYL3CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + rc = 
orte_rmaps_rr_assign_byobj(jdata, HWLOC_OBJ_CACHE, 3); + if (ORTE_ERR_NOT_FOUND == rc) { + /* if the mapper couldn't map by this object because + * it isn't available, but the error allows us to try + * byslot, then do so + */ + ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT); + rc = orte_rmaps_rr_assign_root_level(jdata); + } + } else if (ORTE_MAPPING_BYSOCKET == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + rc = orte_rmaps_rr_assign_byobj(jdata, HWLOC_OBJ_SOCKET, 0); + if (ORTE_ERR_NOT_FOUND == rc) { + /* if the mapper couldn't map by this object because + * it isn't available, but the error allows us to try + * byslot, then do so + */ + ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT); + rc = orte_rmaps_rr_assign_root_level(jdata); + } + } else if (ORTE_MAPPING_BYNUMA == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + rc = orte_rmaps_rr_assign_byobj(jdata, HWLOC_OBJ_NODE, 0); + if (ORTE_ERR_NOT_FOUND == rc) { + /* if the mapper couldn't map by this object because + * it isn't available, but the error allows us to try + * byslot, then do so + */ + ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT); + rc = orte_rmaps_rr_assign_root_level(jdata); + } + } else { + /* unrecognized mapping directive */ + orte_show_help("help-orte-rmaps-base.txt", "unrecognized-policy", + true, "mapping", + orte_rmaps_base_print_mapping(jdata->map->mapping)); + rc = ORTE_ERR_SILENT; + } + return rc; +} + orte_rmaps_base_module_t orte_rmaps_round_robin_module = { - orte_rmaps_rr_map + .map_job = orte_rmaps_rr_map, + .assign_locations = orte_rmaps_rr_assign_locations }; diff --git a/orte/mca/rmaps/round_robin/rmaps_rr.h b/orte/mca/rmaps/round_robin/rmaps_rr.h index 6591a3b6c20..4d998bbbba1 100644 --- a/orte/mca/rmaps/round_robin/rmaps_rr.h +++ b/orte/mca/rmaps/round_robin/rmaps_rr.h @@ -9,7 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. 
* All rights reserved. - * Copyright (c) 2015 Intel, Inc. All rights reserved. + * Copyright (c) 2015-2017 Intel, Inc. All rights reserved. * Copyright (c) 2017 Cisco Systems, Inc. All rights reserved * $COPYRIGHT$ * @@ -54,6 +54,13 @@ ORTE_MODULE_DECLSPEC int orte_rmaps_rr_byobj(orte_job_t *jdata, orte_app_context orte_vpid_t num_procs, hwloc_obj_type_t target, unsigned cache_level); +ORTE_MODULE_DECLSPEC int orte_rmaps_rr_assign_root_level(orte_job_t *jdata); + +ORTE_MODULE_DECLSPEC int orte_rmaps_rr_assign_byobj(orte_job_t *jdata, + hwloc_obj_type_t target, + unsigned cache_level); + + END_C_DECLS #endif diff --git a/orte/mca/rmaps/round_robin/rmaps_rr_assign.c b/orte/mca/rmaps/round_robin/rmaps_rr_assign.c new file mode 100644 index 00000000000..81fa0b67b08 --- /dev/null +++ b/orte/mca/rmaps/round_robin/rmaps_rr_assign.c @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2011 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2009-2013 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. + * Copyright (c) 2015 Research Organization for Information Science + * and Technology (RIST). All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" +#include "orte/constants.h" + +#include + +#include "opal/util/output.h" +#include "opal/mca/hwloc/base/base.h" + +#include "orte/util/show_help.h" +#include "orte/util/name_fns.h" +#include "orte/runtime/orte_globals.h" +#include "orte/mca/errmgr/errmgr.h" + +#include "orte/mca/rmaps/base/rmaps_private.h" +#include "orte/mca/rmaps/base/base.h" +#include "rmaps_rr.h" + +int orte_rmaps_rr_assign_root_level(orte_job_t *jdata) +{ + int i, m; + orte_node_t *node; + orte_proc_t *proc; + hwloc_obj_t obj=NULL; + + opal_output_verbose(2, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rr: assigning procs to root level for job %s", + ORTE_JOBID_PRINT(jdata->jobid)); + + for (m=0; m < jdata->map->nodes->size; m++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, m))) { + continue; + } + opal_output_verbose(2, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rr:slot working node %s", + node->name); + /* get the root object as we are not assigning + * locale here except at the node level */ + if (NULL == node->topology || NULL == node->topology->topo) { + /* nothing we can do */ + continue; + } + obj = hwloc_get_root_obj(node->topology->topo); + for (i=0; i < node->procs->size; i++) { + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) { + continue; + } + /* ignore procs from other jobs */ + if (proc->name.jobid != jdata->jobid) { + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rr:assign skipping proc %s - from another job", + ORTE_NAME_PRINT(&proc->name)); + continue; + } + orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR); + } + } + return ORTE_SUCCESS; +} + +/* mapping by hwloc object looks a lot like mapping by node, + * but has the added complication of possibly having different + * numbers of objects on 
each node + */ +int orte_rmaps_rr_assign_byobj(orte_job_t *jdata, + hwloc_obj_type_t target, + unsigned cache_level) +{ + int start, j, m, n; + orte_app_context_t *app; + orte_node_t *node; + orte_proc_t *proc; + hwloc_obj_t obj=NULL; + unsigned int nobjs; + + opal_output_verbose(2, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rr: assigning locations by %s for job %s", + hwloc_obj_type_string(target), + ORTE_JOBID_PRINT(jdata->jobid)); + + + /* start mapping procs onto objects, filling each object as we go until + * all procs are mapped. If one pass doesn't catch all the required procs, + * then loop thru the list again to handle the oversubscription + */ + for (n=0; n < jdata->apps->size; n++) { + if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, n))) { + continue; + } + for (m=0; m < jdata->map->nodes->size; m++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, m))) { + continue; + } + if (NULL == node->topology || NULL == node->topology->topo) { + orte_show_help("help-orte-rmaps-ppr.txt", "ppr-topo-missing", + true, node->name); + return ORTE_ERR_SILENT; + } + /* get the number of objects of this type on this node */ + nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, target, cache_level, OPAL_HWLOC_AVAILABLE); + if (0 == nobjs) { + continue; + } + opal_output_verbose(2, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rr: found %u %s objects on node %s", + nobjs, hwloc_obj_type_string(target), node->name); + + /* if this is a comm_spawn situation, start with the object + * where the parent left off and increment */ + if (ORTE_JOBID_INVALID != jdata->originator.jobid) { + start = (jdata->bkmark_obj + 1) % nobjs; + } else { + start = 0; + } + /* loop over the procs on this node */ + for (j=0; j < node->procs->size; j++) { + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { + continue; + } + /* ignore procs from other jobs */ + if 
(proc->name.jobid != jdata->jobid) { + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rr:assign skipping proc %s - from another job", + ORTE_NAME_PRINT(&proc->name)); + continue; + } + /* ignore procs from other apps */ + if (proc->app_idx != app->idx) { + continue; + } + opal_output_verbose(20, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rr: assigning proc to object %d", (j + start) % nobjs); + /* get the hwloc object */ + if (NULL == (obj = opal_hwloc_base_get_obj_by_type(node->topology->topo, target, cache_level, (j + start) % nobjs, OPAL_HWLOC_AVAILABLE))) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return ORTE_ERR_NOT_FOUND; + } + if (orte_rmaps_base.cpus_per_rank > (int)opal_hwloc_base_get_npus(node->topology->topo, obj)) { + orte_show_help("help-orte-rmaps-base.txt", "mapping-too-low", true, + orte_rmaps_base.cpus_per_rank, opal_hwloc_base_get_npus(node->topology->topo, obj), + orte_rmaps_base_print_mapping(orte_rmaps_base.mapping)); + return ORTE_ERR_SILENT; + } + orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR); + } + } + } + + return ORTE_SUCCESS; +} diff --git a/orte/mca/rmaps/seq/rmaps_seq.c b/orte/mca/rmaps/seq/rmaps_seq.c index 623a2184f59..9bbe2253964 100644 --- a/orte/mca/rmaps/seq/rmaps_seq.c +++ b/orte/mca/rmaps/seq/rmaps_seq.c @@ -54,7 +54,7 @@ static int orte_rmaps_seq_map(orte_job_t *jdata); /* define the module */ orte_rmaps_base_module_t orte_rmaps_seq_module = { - orte_rmaps_seq_map + .map_job = orte_rmaps_seq_map }; /* local object for tracking rank locations */ @@ -517,6 +517,10 @@ static int orte_rmaps_seq_map(orte_job_t *jdata) } } + /* mark that this job is to be fully + * described in the launch msg */ + orte_set_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL); + return ORTE_SUCCESS; error: diff --git a/orte/mca/state/base/state_base_fns.c b/orte/mca/state/base/state_base_fns.c index 
69cfa8945a8..38c27ba08a2 100644 --- a/orte/mca/state/base/state_base_fns.c +++ b/orte/mca/state/base/state_base_fns.c @@ -899,8 +899,6 @@ void orte_state_base_check_all_complete(int fd, short args, void *cbdata) opal_pointer_array_set_item(map->nodes, index, NULL); /* maintain accounting */ OBJ_RELEASE(node); - /* flag that the node is no longer in a map */ - ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED); } OBJ_RELEASE(map); jdata->map = NULL; diff --git a/orte/mca/state/dvm/state_dvm.c b/orte/mca/state/dvm/state_dvm.c index 6fcecd26bee..d095813594f 100644 --- a/orte/mca/state/dvm/state_dvm.c +++ b/orte/mca/state/dvm/state_dvm.c @@ -255,7 +255,7 @@ static void vm_ready(int fd, short args, void *cbdata) /* if we couldn't provide the allocation regex on the orted * cmd line, then we need to provide all the info here */ if (!orte_nidmap_communicated) { - if (ORTE_SUCCESS != (rc = orte_util_nidmap_create(&nidmap))) { + if (ORTE_SUCCESS != (rc = orte_util_nidmap_create(orte_node_pool, &nidmap))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(buf); return; diff --git a/orte/mca/state/hnp/state_hnp.c b/orte/mca/state/hnp/state_hnp.c index c18c4a0e01a..cfde6135390 100644 --- a/orte/mca/state/hnp/state_hnp.c +++ b/orte/mca/state/hnp/state_hnp.c @@ -73,6 +73,8 @@ static orte_job_state_t launch_states[] = { ORTE_JOB_STATE_DAEMONS_LAUNCHED, ORTE_JOB_STATE_DAEMONS_REPORTED, ORTE_JOB_STATE_VM_READY, + ORTE_JOB_STATE_MAP, + ORTE_JOB_STATE_MAP_COMPLETE, ORTE_JOB_STATE_SYSTEM_PREP, ORTE_JOB_STATE_LAUNCH_APPS, ORTE_JOB_STATE_LOCAL_LAUNCH_COMPLETE, @@ -91,6 +93,8 @@ static orte_state_cbfunc_t launch_callbacks[] = { orte_plm_base_daemons_launched, orte_plm_base_daemons_reported, orte_plm_base_vm_ready, + orte_rmaps_base_map_job, + orte_plm_base_mapping_complete, orte_plm_base_complete_setup, orte_plm_base_launch_apps, orte_state_base_local_launch_complete, diff --git a/orte/mca/state/novm/state_novm.c b/orte/mca/state/novm/state_novm.c index 512f6cc43dd..72d7c0bd397 100644 --- 
a/orte/mca/state/novm/state_novm.c +++ b/orte/mca/state/novm/state_novm.c @@ -61,6 +61,7 @@ orte_state_base_module_t orte_state_novm_module = { }; static void allocation_complete(int fd, short args, void *cbdata); +static void map_complete(int fd, short args, void *cbdata); static void vm_ready(int fd, short args, void *cbdata); /* defined state machine sequence for no VM - individual @@ -74,6 +75,8 @@ static orte_job_state_t launch_states[] = { ORTE_JOB_STATE_DAEMONS_LAUNCHED, ORTE_JOB_STATE_DAEMONS_REPORTED, ORTE_JOB_STATE_VM_READY, + ORTE_JOB_STATE_MAP, + ORTE_JOB_STATE_MAP_COMPLETE, ORTE_JOB_STATE_SYSTEM_PREP, ORTE_JOB_STATE_LAUNCH_APPS, ORTE_JOB_STATE_LOCAL_LAUNCH_COMPLETE, @@ -93,6 +96,8 @@ static orte_state_cbfunc_t launch_callbacks[] = { orte_plm_base_daemons_launched, orte_plm_base_daemons_reported, vm_ready, + orte_rmaps_base_map_job, + map_complete, orte_plm_base_complete_setup, orte_plm_base_launch_apps, orte_state_base_local_launch_complete, @@ -195,7 +200,7 @@ static void allocation_complete(int fd, short args, void *cbdata) orte_job_t *daemons; orte_topology_t *t; orte_node_t *node; - int i, rc; + int i; jdata->state = ORTE_JOB_STATE_ALLOCATION_COMPLETE; @@ -235,21 +240,27 @@ static void allocation_complete(int fd, short args, void *cbdata) } } - /* perform the map */ - if (ORTE_SUCCESS != (rc = orte_rmaps_base_map_job(jdata))) { - ORTE_ERROR_LOG(rc); - ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); - goto done; - } - - /* after we map, we are ready to launch the daemons */ - ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_LAUNCH_DAEMONS); + /* move to the map stage */ + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP); done: /* cleanup */ OBJ_RELEASE(state); } +/* after we map, we are ready to launch the daemons */ +static void map_complete(int fd, short args, void *cbdata) +{ + orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata; + orte_job_t *jdata = state->jdata; + + jdata->state = ORTE_JOB_STATE_MAP_COMPLETE; + /* mapping is complete - move to the daemon launch stage 
*/ + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_LAUNCH_DAEMONS); + + /* cleanup */ + OBJ_RELEASE(state); +} static void vm_ready(int fd, short args, void *cbdata) { diff --git a/orte/orted/pmix/pmix_server_pub.c b/orte/orted/pmix/pmix_server_pub.c index 4dcb9cfb755..6b3e5bde785 100644 --- a/orte/orted/pmix/pmix_server_pub.c +++ b/orte/orted/pmix/pmix_server_pub.c @@ -99,6 +99,10 @@ int pmix_server_publish_fn(opal_process_name_t *proc, opal_pmix_persistence_t persist = OPAL_PMIX_PERSIST_APP; bool rset, pset; + opal_output_verbose(1, orte_pmix_server_globals.output, + "%s orted:pmix:server PUBLISH", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); + /* create the caddy */ req = OBJ_NEW(pmix_server_req_t); (void)asprintf(&req->operation, "PUBLISH: %s:%d", __FILE__, __LINE__); @@ -259,6 +263,10 @@ int pmix_server_lookup_fn(opal_process_name_t *proc, char **keys, /* pack the keys too */ for (i=0; i < nkeys; i++) { + opal_output_verbose(5, orte_pmix_server_globals.output, + "%s lookup data %s for proc %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), keys[i], + ORTE_NAME_PRINT(proc)); if (OPAL_SUCCESS != (rc = opal_dss.pack(&req->msg, &keys[i], 1, OPAL_STRING))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(req); diff --git a/orte/runtime/data_type_support/orte_dt_packing_fns.c b/orte/runtime/data_type_support/orte_dt_packing_fns.c index c5914169198..04e434645f6 100644 --- a/orte/runtime/data_type_support/orte_dt_packing_fns.c +++ b/orte/runtime/data_type_support/orte_dt_packing_fns.c @@ -89,6 +89,53 @@ int orte_dt_pack_job(opal_buffer_t *buffer, const void *src, return rc; } + /* pack the attributes that need to be sent */ + count = 0; + OPAL_LIST_FOREACH(kv, &jobs[i]->attributes, orte_attribute_t) { + if (ORTE_ATTR_GLOBAL == kv->local) { + ++count; + } + } + if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, (void*)(&count), 1, ORTE_STD_CNTR))) { + ORTE_ERROR_LOG(rc); + return rc; + } + OPAL_LIST_FOREACH(kv, &jobs[i]->attributes, orte_attribute_t) { + if (ORTE_ATTR_GLOBAL == kv->local) { + 
if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, (void*)&kv, 1, ORTE_ATTRIBUTE))) { + ORTE_ERROR_LOG(rc); + return rc; + } + } + } + /* check for job info attribute */ + cache = NULL; + if (orte_get_attribute(&jobs[i]->attributes, ORTE_JOB_INFO_CACHE, (void**)&cache, OPAL_PTR) && + NULL != cache) { + /* we need to pack these as well, but they are composed + * of opal_value_t's on a list. So first pack the number + * of list elements */ + count = opal_list_get_size(cache); + if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, (void*)(&count), 1, ORTE_STD_CNTR))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /* now pack each element on the list */ + OPAL_LIST_FOREACH(val, cache, opal_value_t) { + if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, (void*)&val, 1, OPAL_VALUE))) { + ORTE_ERROR_LOG(rc); + return rc; + } + } + } else { + /* pack a zero to indicate no job info is being passed */ + count = 0; + if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, (void*)(&count), 1, ORTE_STD_CNTR))) { + ORTE_ERROR_LOG(rc); + return rc; + } + } + /* pack the personality */ count = opal_argv_count(jobs[i]->personality); if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &count, 1, OPAL_INT32))) { @@ -134,14 +181,18 @@ int orte_dt_pack_job(opal_buffer_t *buffer, const void *src, return rc; } - if (orte_no_vm && 0 < jobs[i]->num_procs) { - for (j=0; j < jobs[i]->procs->size; j++) { - if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jobs[i]->procs, j))) { - continue; - } - if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, (void*)&proc, 1, ORTE_PROC))) { - ORTE_ERROR_LOG(rc); - return rc; + if (0 < jobs[i]->num_procs) { + /* check attributes to see if this job is to be fully + * described in the launch msg */ + if (orte_get_attribute(&jobs[i]->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) { + for (j=0; j < jobs[i]->procs->size; j++) { + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jobs[i]->procs, j))) { + continue; + } 
+ if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, (void*)&proc, 1, ORTE_PROC))) { + ORTE_ERROR_LOG(rc); + return rc; + } } } } @@ -198,53 +249,6 @@ int orte_dt_pack_job(opal_buffer_t *buffer, const void *src, ORTE_ERROR_LOG(rc); return rc; } - - /* pack the attributes that need to be sent */ - count = 0; - OPAL_LIST_FOREACH(kv, &jobs[i]->attributes, orte_attribute_t) { - if (ORTE_ATTR_GLOBAL == kv->local) { - ++count; - } - } - if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, (void*)(&count), 1, ORTE_STD_CNTR))) { - ORTE_ERROR_LOG(rc); - return rc; - } - OPAL_LIST_FOREACH(kv, &jobs[i]->attributes, orte_attribute_t) { - if (ORTE_ATTR_GLOBAL == kv->local) { - if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, (void*)&kv, 1, ORTE_ATTRIBUTE))) { - ORTE_ERROR_LOG(rc); - return rc; - } - } - } - /* check for job info attribute */ - cache = NULL; - if (orte_get_attribute(&jobs[i]->attributes, ORTE_JOB_INFO_CACHE, (void**)&cache, OPAL_PTR) && - NULL != cache) { - /* we need to pack these as well, but they are composed - * of opal_value_t's on a list. 
So first pack the number - * of list elements */ - count = opal_list_get_size(cache); - if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, (void*)(&count), 1, ORTE_STD_CNTR))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* now pack each element on the list */ - OPAL_LIST_FOREACH(val, cache, opal_value_t) { - if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, (void*)&val, 1, OPAL_VALUE))) { - ORTE_ERROR_LOG(rc); - return rc; - } - } - } else { - /* pack a zero to indicate no job info is being passed */ - count = 0; - if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, (void*)(&count), 1, ORTE_STD_CNTR))) { - ORTE_ERROR_LOG(rc); - return rc; - } - } } return ORTE_SUCCESS; } @@ -594,7 +598,11 @@ int orte_dt_pack_map(opal_buffer_t *buffer, const void *src, ORTE_ERROR_LOG(rc); return rc; } - + /* pack the last mapper */ + if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &(maps[i]->last_mapper), 1, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + return rc; + } /* pack the policies */ if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &(maps[i]->mapping), 1, ORTE_MAPPING_POLICY))) { ORTE_ERROR_LOG(rc); diff --git a/orte/runtime/data_type_support/orte_dt_unpacking_fns.c b/orte/runtime/data_type_support/orte_dt_unpacking_fns.c index 93df939c8fb..6e49c160520 100644 --- a/orte/runtime/data_type_support/orte_dt_unpacking_fns.c +++ b/orte/runtime/data_type_support/orte_dt_unpacking_fns.c @@ -95,6 +95,44 @@ int orte_dt_unpack_job(opal_buffer_t *buffer, void *dest, return rc; } + /* unpack the attributes */ + n=1; + if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &count, + &n, ORTE_STD_CNTR))) { + ORTE_ERROR_LOG(rc); + return rc; + } + for (k=0; k < count; k++) { + n=1; + if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &kv, + &n, ORTE_ATTRIBUTE))) { + ORTE_ERROR_LOG(rc); + return rc; + } + kv->local = ORTE_ATTR_GLOBAL; // obviously not a local value + opal_list_append(&jobs[i]->attributes, &kv->super); + } + /* unpack any job info */ + n=1; + if 
(ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &count, + &n, ORTE_STD_CNTR))) { + ORTE_ERROR_LOG(rc); + return rc; + } + if (0 < count){ + cache = OBJ_NEW(opal_list_t); + orte_set_attribute(&jobs[i]->attributes, ORTE_JOB_INFO_CACHE, ORTE_ATTR_LOCAL, (void*)cache, OPAL_PTR); + for (k=0; k < count; k++) { + n=1; + if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &val, + &n, OPAL_VALUE))) { + ORTE_ERROR_LOG(rc); + return rc; + } + opal_list_append(cache, &val->super); + } + } + /* unpack the personality */ n=1; if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &count, &n, OPAL_INT32))) { @@ -147,16 +185,20 @@ int orte_dt_unpack_job(opal_buffer_t *buffer, void *dest, return rc; } - if (orte_no_vm && 0 < jobs[i]->num_procs) { - orte_proc_t *proc; - for (j=0; j < jobs[i]->num_procs; j++) { - n = 1; - if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, - &proc, &n, ORTE_PROC))) { - ORTE_ERROR_LOG(rc); - return rc; + if (0 < jobs[i]->num_procs) { + /* check attributes to see if this job was fully + * described in the launch msg */ + if (orte_get_attribute(&jobs[i]->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) { + orte_proc_t *proc; + for (j=0; j < jobs[i]->num_procs; j++) { + n = 1; + if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, + &proc, &n, ORTE_PROC))) { + ORTE_ERROR_LOG(rc); + return rc; + } + opal_pointer_array_add(jobs[i]->procs, proc); } - opal_pointer_array_add(jobs[i]->procs, proc); } } @@ -204,44 +246,6 @@ int orte_dt_unpack_job(opal_buffer_t *buffer, void *dest, ORTE_ERROR_LOG(rc); return rc; } - - /* unpack the attributes */ - n=1; - if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &count, - &n, ORTE_STD_CNTR))) { - ORTE_ERROR_LOG(rc); - return rc; - } - for (k=0; k < count; k++) { - n=1; - if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &kv, - &n, ORTE_ATTRIBUTE))) { - ORTE_ERROR_LOG(rc); - return rc; - } - kv->local = ORTE_ATTR_GLOBAL; // obviously not a local value - 
opal_list_append(&jobs[i]->attributes, &kv->super); - } - /* unpack any job info */ - n=1; - if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &count, - &n, ORTE_STD_CNTR))) { - ORTE_ERROR_LOG(rc); - return rc; - } - if (0 < count){ - cache = OBJ_NEW(opal_list_t); - orte_set_attribute(&jobs[i]->attributes, ORTE_JOB_INFO_CACHE, ORTE_ATTR_LOCAL, (void*)cache, OPAL_PTR); - for (k=0; k < count; k++) { - n=1; - if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &val, - &n, OPAL_VALUE))) { - ORTE_ERROR_LOG(rc); - return rc; - } - opal_list_append(cache, &val->super); - } - } } return ORTE_SUCCESS; @@ -655,6 +659,14 @@ int orte_dt_unpack_map(opal_buffer_t *buffer, void *dest, return rc; } + /* unpack the last mapper */ + n = 1; + if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, + &(maps[i]->last_mapper), &n, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /* unpack the policies */ n = 1; if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, diff --git a/orte/runtime/orte_data_server.c b/orte/runtime/orte_data_server.c index 807f13f5911..605b0acd077 100644 --- a/orte/runtime/orte_data_server.c +++ b/orte/runtime/orte_data_server.c @@ -12,7 +12,7 @@ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2012-2016 Los Alamos National Security, LLC. * All rights reserved - * Copyright (c) 2015-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2015-2017 Intel, Inc. All rights reserved. * Copyright (c) 2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. 
* $COPYRIGHT$ @@ -111,6 +111,8 @@ OBJ_CLASS_INSTANCE(orte_data_req_t, static opal_pointer_array_t orte_data_server_store; static opal_list_t pending; static bool initialized = false; +static int orte_data_server_output = -1; +static int orte_data_server_verbosity = -1; int orte_data_server_init(void) { @@ -121,6 +123,19 @@ int orte_data_server_init(void) } initialized = true; + /* register a verbosity */ + orte_data_server_verbosity = -1; + (void) mca_base_var_register ("orte", "orte", "data", "server_verbose", + "Debug verbosity for ORTE data server", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL, + &orte_data_server_verbosity); + if (0 <= orte_data_server_verbosity) { + orte_data_server_output = opal_output_open(NULL); + opal_output_set_verbosity(orte_data_server_output, + orte_data_server_verbosity); + } + OBJ_CONSTRUCT(&orte_data_server_store, opal_pointer_array_t); if (ORTE_SUCCESS != (rc = opal_pointer_array_init(&orte_data_server_store, 1, @@ -180,7 +195,7 @@ void orte_data_server(int status, orte_process_name_t* sender, orte_data_req_t *req, *rqnext; orte_jobid_t jobid = ORTE_JOBID_INVALID; - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + OPAL_OUTPUT_VERBOSE((1, orte_data_server_output, "%s data server got message from %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(sender))); @@ -218,7 +233,7 @@ void orte_data_server(int status, orte_process_name_t* sender, goto SEND_ERROR; } - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + OPAL_OUTPUT_VERBOSE((1, orte_data_server_output, "%s data server: publishing data from %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&data->owner))); @@ -245,7 +260,7 @@ void orte_data_server(int status, orte_process_name_t* sender, data->uid = iptr->data.uint32; OBJ_RELEASE(iptr); } else { - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + OPAL_OUTPUT_VERBOSE((10, orte_data_server_output, "%s data server: adding %s to data from %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), iptr->key, 
ORTE_NAME_PRINT(&data->owner))); @@ -255,7 +270,7 @@ void orte_data_server(int status, orte_process_name_t* sender, data->index = opal_pointer_array_add(&orte_data_server_store, data); - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + OPAL_OUTPUT_VERBOSE((1, orte_data_server_output, "%s data server: checking for pending requests", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); @@ -276,7 +291,14 @@ void orte_data_server(int status, orte_process_name_t* sender, for (i=0; NULL != req->keys[i]; i++) { /* cycle thru the data keys for matches */ OPAL_LIST_FOREACH(iptr, &data->values, opal_value_t) { + OPAL_OUTPUT_VERBOSE((10, orte_data_server_output, + "%s\tCHECKING %s TO %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + iptr->key, req->keys[i])); if (0 == strcmp(iptr->key, req->keys[i])) { + OPAL_OUTPUT_VERBOSE((10, orte_data_server_output, + "%s data server: packaging return", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* found it - package it for return */ if (NULL == reply) { reply = OBJ_NEW(opal_buffer_t); @@ -296,7 +318,7 @@ void orte_data_server(int status, orte_process_name_t* sender, ORTE_ERROR_LOG(rc); break; } - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + OPAL_OUTPUT_VERBOSE((10, orte_data_server_output, "%s data server: adding %s data from %s to response", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), iptr->key, ORTE_NAME_PRINT(&data->owner))); @@ -309,7 +331,7 @@ void orte_data_server(int status, orte_process_name_t* sender, } if (NULL != reply) { /* send it back to the requestor */ - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + OPAL_OUTPUT_VERBOSE((1, orte_data_server_output, "%s data server: returning data to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&req->requestor))); @@ -326,11 +348,11 @@ void orte_data_server(int status, orte_process_name_t* sender, reply = NULL; /* if the persistence is "first_read", then delete this data */ if (OPAL_PMIX_PERSIST_FIRST_READ == data->persistence) { - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + OPAL_OUTPUT_VERBOSE((1, 
orte_data_server_output, "%s NOT STORING DATA FROM %s AT INDEX %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&data->owner), data->index)); opal_pointer_array_set_item(&orte_data_server_store, data->index, NULL); OBJ_RELEASE(data); goto release; } @@ -349,7 +371,7 @@ void orte_data_server(int status, orte_process_name_t* sender, break; case ORTE_PMIX_LOOKUP_CMD: - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + OPAL_OUTPUT_VERBOSE((1, orte_data_server_output, "%s data server: lookup data from %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(sender))); @@ -416,7 +438,7 @@ void orte_data_server(int status, orte_process_name_t* sender, /* cycle across the provided keys */ ret_packed = false; for (i=0; NULL != keys[i]; i++) { - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + OPAL_OUTPUT_VERBOSE((10, orte_data_server_output, "%s data server: looking for %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), keys[i])); /* cycle across the stored data, looking for a match */ @@ -428,6 +450,10 @@ void orte_data_server(int status, orte_process_name_t* sender, } /* for security reasons, can only access data posted by the same user id */ if (uid != data->uid) { + OPAL_OUTPUT_VERBOSE((10, orte_data_server_output, + "%s\tMISMATCH UID %u %u", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (unsigned)uid, (unsigned)data->uid)); continue; } /* if the published range is constrained to namespace, then only @@ -435,12 +461,17 @@ void orte_data_server(int status, orte_process_name_t* sender, * in the same namespace as the requestor */ if (OPAL_PMIX_RANGE_NAMESPACE == data->range) { if (jobid != data->owner.jobid) { + OPAL_OUTPUT_VERBOSE((10, orte_data_server_output, + "%s\tMISMATCH JOBID %s %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(jobid), + ORTE_JOBID_PRINT(data->owner.jobid))); continue; } } /* see if we have this key */ OPAL_LIST_FOREACH(iptr, 
&data->values, opal_value_t) { - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + OPAL_OUTPUT_VERBOSE((10, orte_data_server_output, "%s COMPARING %s %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), keys[i], iptr->key)); @@ -461,7 +492,7 @@ void orte_data_server(int status, orte_process_name_t* sender, opal_argv_free(keys); goto SEND_ERROR; } - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + OPAL_OUTPUT_VERBOSE((1, orte_data_server_output, "%s data server: adding %s to data from %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), iptr->key, ORTE_NAME_PRINT(&data->owner))); @@ -473,7 +504,7 @@ void orte_data_server(int status, orte_process_name_t* sender, } } if (data_added && OPAL_PMIX_PERSIST_FIRST_READ == data->persistence) { - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + OPAL_OUTPUT_VERBOSE((1, orte_data_server_output, "%s REMOVING DATA FROM %s AT INDEX %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&data->owner), data->index)); @@ -483,14 +514,14 @@ void orte_data_server(int status, orte_process_name_t* sender, } } if (!ret_packed) { - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + OPAL_OUTPUT_VERBOSE((1, orte_data_server_output, "%s data server:lookup: data not found", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* if we were told to wait for the data, then queue this up * for later processing */ if (wait) { - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + OPAL_OUTPUT_VERBOSE((1, orte_data_server_output, "%s data server:lookup: pushing request to wait", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); OBJ_RELEASE(answer); @@ -510,7 +541,7 @@ void orte_data_server(int status, orte_process_name_t* sender, } opal_argv_free(keys); - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + OPAL_OUTPUT_VERBOSE((1, orte_data_server_output, "%s data server:lookup: data found", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); goto SEND_ANSWER; @@ -524,7 +555,7 @@ void orte_data_server(int status, orte_process_name_t* sender, goto SEND_ERROR; } - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + OPAL_OUTPUT_VERBOSE((1, 
orte_data_server_output, "%s data server: unpublish data from %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&requestor))); @@ -629,7 +660,7 @@ void orte_data_server(int status, orte_process_name_t* sender, } SEND_ERROR: - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + OPAL_OUTPUT_VERBOSE((1, orte_data_server_output, "%s data server: sending error %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_ERROR_NAME(rc))); @@ -646,5 +677,3 @@ void orte_data_server(int status, orte_process_name_t* sender, OBJ_RELEASE(answer); } } - - diff --git a/orte/test/mpi/Makefile b/orte/test/mpi/Makefile index 3bf63b8b0b3..47f183a6e57 100644 --- a/orte/test/mpi/Makefile +++ b/orte/test/mpi/Makefile @@ -1,4 +1,11 @@ -PROGS = mpi_no_op mpi_barrier hello hello_nodename abort multi_abort simple_spawn concurrent_spawn spawn_multiple mpi_spin delayed_abort loop_spawn loop_child bad_exit pubsub hello_barrier segv accept connect hello_output hello_show_help crisscross read_write ziatest slave reduce-hang ziaprobe ziatest bcast_loop parallel_w8 parallel_w64 parallel_r8 parallel_r64 sio sendrecv_blaster early_abort debugger singleton_client_server intercomm_create spawn_tree init-exit77 mpi_info info_spawn server client paccept pconnect ring hello.sapp binding badcoll attach xlib +PROGS = mpi_no_op mpi_barrier hello hello_nodename abort multi_abort simple_spawn \ + concurrent_spawn spawn_multiple mpi_spin delayed_abort loop_spawn loop_child \ + bad_exit pubsub hello_barrier segv accept connect hello_output hello_show_help \ + crisscross read_write ziatest slave reduce-hang ziaprobe ziatest bcast_loop \ + parallel_w8 parallel_w64 parallel_r8 parallel_r64 sio sendrecv_blaster early_abort \ + debugger singleton_client_server intercomm_create spawn_tree init-exit77 mpi_info \ + info_spawn server client paccept pconnect ring hello.sapp binding badcoll attach xlib \ + no-disconnect all: $(PROGS) diff --git a/orte/test/mpi/no-disconnect.c b/orte/test/mpi/no-disconnect.c new file mode 100644 index 
00000000000..9403b3ff345 --- /dev/null +++ b/orte/test/mpi/no-disconnect.c @@ -0,0 +1,210 @@ +/* Contributed by Marcia Cristina Cera + , + http://www.open-mpi.org/community/lists/users/2009/12/11540.php */ + +/* It was decided that the issue highlighted by this test will NOT be + fixed in the 1.3/1.4 series. It is already fixed in the 1.5 + series. Hence, if we detect Open MPI < v1.5, return 77/skip. */ +/* Turns out the hnp cannot handle concurrent MPI_Comm_spawns + as of Open MPI 1.7. However, we hope this feature will + work in 2.0. with the new state machine based orte. */ + +#include +#include +#include +#include +#include +#include + +#include + +#define NCHARS 30 +const int max_depth = 4; + +/* + * Here are some replacements for standard, blocking MPI + * functions. These replacements are "nice" and yield the + * CPU instead of spinning hard. The interfaces are the same. + * Just replace: + * MPI_Recv with nice_recv + * MPI_Send with nice_send + * MPI_Barrier with nice_barrier + */ + + +static int nice_send(void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm) { + /* Assume a standard (presumably short/eager) send suffices. */ + return MPI_Send(buf, count, datatype, dest, tag, comm); +} + + +static int nice_recv(void *buf, int count, MPI_Datatype datatype, int source, int tag, MPI_Comm comm, MPI_Status *status) { + MPI_Request req; + int flag; + struct timespec dt; + + /* + * We're only interested in modest levels of oversubscription + * -- e.g., 2-4x more processes than physical processors. + * So, the sleep time only needs to be about 2-4x longer than + * a futile MPI_Test call. For a wide range of processors, + * something less than a millisecond should be sufficient. + * Excessive sleep times (e.g., 1 second) would degrade performance. + */ + dt.tv_sec = 0; + dt.tv_nsec = 100000; + + MPI_Irecv(buf, count, datatype, source, tag, comm, &req); + + MPI_Test(&req, &flag, status); + while ( ! 
flag ) { + nanosleep(&dt, NULL); + MPI_Test(&req, &flag, status); + } + return MPI_SUCCESS; +} + + +static void nice_barrier(MPI_Comm comm) { + int me, np, jump, buf = -1; + + MPI_Comm_rank(comm,&me); + MPI_Comm_size(comm,&np); + + /* fan in */ + for ( jump = 1; jump < np; jump <<= 1 ) { + if ( ( me & jump ) != 0 ) { + nice_send(&buf, 1, MPI_INT, me - jump, 343, comm); + break; + } else if ( me + jump < np ) { + nice_recv(&buf, 1, MPI_INT, me + jump, 343, comm, MPI_STATUS_IGNORE); + } + } + + /* fan out */ + if ( 0 != me ) { + nice_recv(&buf, 1, MPI_INT, me - jump, 344, comm, MPI_STATUS_IGNORE); + } + jump >>= 1; + for ( ; jump > 0; jump >>= 1 ) { + if ( me + jump < np ) { + nice_send(&buf, 1, MPI_INT, me + jump, 344, comm); + } + } +} + + +int main (int argc, char **argv) +{ + char bufs [NCHARS]; /* send buffer */ + char bufr[2][NCHARS]; /* recv buffers */ + MPI_Comm parent; + int level = 0, participate = 1; + struct utsname buf; + + /* If this is prior to OMPI v2.0, return 77/skip */ +#if defined(OPEN_MPI) + if (OMPI_MAJOR_VERSION < 2) { + printf("Skipping, because the orte cannot handle concurrent MPI_Comm_spawns\n"); + return 77; + } else { + printf("Verify that this test is truly working because conncurrent MPI_Comm_spawns" + " has not worked before.\n"); + } +#endif + + uname(&buf); + printf("I AM pid %d with level %d on %s\n", getpid(), (argc < 2)?0:atoi(argv[1]), buf.nodename); + + MPI_Init(&argc, &argv); + MPI_Comm_get_parent(&parent); + + if (MPI_COMM_NULL != parent) { + /* spawned processes get stuff from parent */ + level = atoi(argv[1]); + MPI_Recv(&bufr[0], sizeof(char)*NCHARS, MPI_CHAR, MPI_ANY_SOURCE, + MPI_ANY_TAG, parent, MPI_STATUS_IGNORE); + printf("Parent sent: %s\n", bufr[0]); + } else { + + /* original processes have to decide whether to participate */ + + /* In this test, each process launched by "mpirun -n <np>" spawns a + * binary tree of processes. You end up with <np> * ( 1 << max_depth ) + * processes altogether. For max_depth=4, this means 16*<np>.
There + * is potential here for heavy oversubscription, especially if in + * testing we launch tests with <np> set to the number of available + * processors. This test tolerates oversubscription somewhat since + * it entails little inter-process synchronization. Nevertheless, + * we try to idle all but <np>/4 of the original processes, using a + * minimum of at least two processes + */ + + int me, np; + + MPI_Comm_size(MPI_COMM_WORLD,&np); + MPI_Comm_rank(MPI_COMM_WORLD,&me); + + if ( np > 4 ) { + /* turn off all but every 4th process */ + if ( ( me & 3 ) != 0 ) participate = 0; + } else + if ( np > 2 ) { + /* turn off all but every 2nd process */ + if ( ( me & 1 ) != 0 ) participate = 0; + } + } + + /* all spawned processes and selected "root" processes participate */ + if ( participate ) { + printf("level = %d\n", level); + + /* prepare send buffer */ + sprintf(bufs,"level %d (pid:%d)", level, getpid()); + + /* spawn */ + if (level < max_depth) { + int i, nspawn = 2, errcodes[1]; + MPI_Request req[2]; + MPI_Comm comm[2]; + char argv1[NCHARS]; + char *args[2]; + + /* level 0 spawns only one process to mimic the original test */ + if ( level == 0 ) nspawn = 1; + + /* prepare command line arguments */ + snprintf(argv1, sizeof(argv1), "%d", level+1); + args[0] = argv1; + args[1] = NULL; + + /* spawn, with a message sent to and received from each child */ + for ( i = 0; i < nspawn; i++ ) { + MPI_Comm_spawn(argv[0], args, 1, MPI_INFO_NULL, 0, MPI_COMM_SELF, + &comm[i], errcodes); + MPI_Send(&bufs, sizeof(char)*NCHARS, MPI_CHAR, 0, 100, comm[i]); + MPI_Irecv(&bufr[i], sizeof(char)*NCHARS, MPI_CHAR, MPI_ANY_SOURCE, + MPI_ANY_TAG, comm[i], &req[i]); + } + + /* wait for messages from children and print them */ + MPI_Waitall(nspawn, req, MPI_STATUSES_IGNORE); + for ( i = 0; i < nspawn; i++ ) + printf("Child %d sent: %s\n", i, bufr[i]); + } + + /* send message back to parent */ + if (MPI_COMM_NULL != parent) { + MPI_Send(&bufs, sizeof(char)*NCHARS, MPI_CHAR, 0, 100, parent); + } + }
+ + /* non-participating processes wait at this barrier for their peers */ + /* (This barrier won't cost that many CPU cycles.) */ + if (MPI_COMM_NULL == parent) { + nice_barrier(MPI_COMM_WORLD); + } + + MPI_Finalize(); + return 0; +} diff --git a/orte/util/attr.c b/orte/util/attr.c index 1f447f4a87c..a2d6ed48a7d 100644 --- a/orte/util/attr.c +++ b/orte/util/attr.c @@ -286,6 +286,8 @@ const char *orte_attr_key_to_str(orte_attribute_key_t key) return "ORTE_JOB_TRANSPORT_KEY"; case ORTE_JOB_INFO_CACHE: return "ORTE_JOB_INFO_CACHE"; + case ORTE_JOB_FULLY_DESCRIBED: + return "ORTE_JOB_FULLY_DESCRIBED"; case ORTE_PROC_NOBARRIER: return "PROC-NOBARRIER"; diff --git a/orte/util/attr.h b/orte/util/attr.h index 1b961030091..817581e38b6 100644 --- a/orte/util/attr.h +++ b/orte/util/attr.h @@ -143,6 +143,7 @@ typedef uint16_t orte_job_flags_t; #define ORTE_JOB_NOTIFY_COMPLETION (ORTE_JOB_START_KEY + 50) // bool - notify parent proc when spawned job terminates #define ORTE_JOB_TRANSPORT_KEY (ORTE_JOB_START_KEY + 51) // string - transport keys assigned to this job #define ORTE_JOB_INFO_CACHE (ORTE_JOB_START_KEY + 52) // opal_list_t - list of opal_value_t to be included in job_info +#define ORTE_JOB_FULLY_DESCRIBED (ORTE_JOB_START_KEY + 53) // bool - job is fully described in launch msg #define ORTE_JOB_MAX_KEY 300 diff --git a/orte/util/nidmap.c b/orte/util/nidmap.c index 3b2ec9bdfeb..ca4948fcbca 100644 --- a/orte/util/nidmap.c +++ b/orte/util/nidmap.c @@ -198,7 +198,7 @@ int orte_util_build_daemon_nidmap(void) return rc; } -int orte_util_nidmap_create(char **regex) +int orte_util_nidmap_create(opal_pointer_array_t *pool, char **regex) { char *node; char prefix[ORTE_MAX_NODE_PREFIX]; @@ -217,8 +217,8 @@ int orte_util_nidmap_create(char **regex) OBJ_CONSTRUCT(&dvpids, opal_list_t); rng = NULL; - for (n=0; n < orte_node_pool->size; n++) { - if (NULL == (nptr = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, n))) { + for (n=0; n < pool->size; n++) { + if (NULL == (nptr = 
(orte_node_t*)opal_pointer_array_get_item(pool, n))) { continue; } /* if no daemon has been assigned, then this node is not being used */ @@ -1180,3 +1180,217 @@ int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer) OPAL_LIST_DESTRUCT(&flgs); return rc; } + +typedef struct { + opal_list_item_t super; + int ctx; + int nprocs; + int cnt; +} orte_nidmap_regex_t; +static void nrcon(orte_nidmap_regex_t *p) +{ + p->ctx = 0; + p->nprocs = -1; + p->cnt = 0; +} +static OBJ_CLASS_INSTANCE(orte_nidmap_regex_t, + opal_list_item_t, + nrcon, NULL); + +/* since not every node is involved in a job, we have to create a + * regex that indicates the ppn for every node, marking those that + * are not involved. Since each daemon knows the entire + * node pool, we simply provide a ppn for every daemon, with a 0 + * to indicate that the node is empty for that job */ +int orte_util_nidmap_generate_ppn(orte_job_t *jdata, char **ppn) +{ + orte_nidmap_regex_t *prng, **actives; + opal_list_t *prk; + orte_node_t *nptr; + orte_proc_t *proc; + size_t n; + int *cnt, i, k; + char *tmp2, *ptmp, **cache = NULL; + + /* create an array of lists to handle the number of app_contexts in this job */ + prk = (opal_list_t*)malloc(jdata->num_apps * sizeof(opal_list_t)); + cnt = (int*)malloc(jdata->num_apps * sizeof(int)); + actives = (orte_nidmap_regex_t**)malloc(jdata->num_apps * sizeof(orte_nidmap_regex_t*)); + for (n=0; n < jdata->num_apps; n++) { + OBJ_CONSTRUCT(&prk[n], opal_list_t); + actives[n] = NULL; + } + + /* we provide a complete map in the regex, with an entry for every + * node in the pool */ + for (i=0; i < orte_node_pool->size; i++) { + if (NULL == (nptr = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) { + continue; + } + /* if a daemon has been assigned, then count how many procs + * for each app_context from the specified job are assigned to this node */ + memset(cnt, 0, jdata->num_apps * sizeof(int)); + if (NULL != nptr->daemon) { + for (k=0; k < nptr->procs->size;
k++) { + if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(nptr->procs, k))) { + if (proc->name.jobid == jdata->jobid) { + ++cnt[proc->app_idx]; + } + } + } + } + /* track the #procs on this node */ + for (n=0; n < jdata->num_apps; n++) { + if (NULL == actives[n]) { + /* just starting */ + actives[n] = OBJ_NEW(orte_nidmap_regex_t); + actives[n]->nprocs = cnt[n]; + actives[n]->cnt = 1; + opal_list_append(&prk[n], &actives[n]->super); + } else { + /* is this the next in line */ + if (cnt[n] == actives[n]->nprocs) { + actives[n]->cnt++; + } else { + /* need to start another range */ + actives[n] = OBJ_NEW(orte_nidmap_regex_t); + actives[n]->nprocs = cnt[n]; + actives[n]->cnt = 1; + opal_list_append(&prk[n], &actives[n]->super); + } + } + } + } + + /* construct the regex from the found ranges for each app_context */ + ptmp = NULL; + for (n=0; n < jdata->num_apps; n++) { + OPAL_LIST_FOREACH(prng, &prk[n], orte_nidmap_regex_t) { + if (1 < prng->cnt) { + if (NULL == ptmp) { + asprintf(&ptmp, "%u(%u)", prng->nprocs, prng->cnt); + } else { + asprintf(&tmp2, "%s,%u(%u)", ptmp, prng->nprocs, prng->cnt); + free(ptmp); + ptmp = tmp2; + } + } else { + if (NULL == ptmp) { + asprintf(&ptmp, "%u", prng->nprocs); + } else { + asprintf(&tmp2, "%s,%u", ptmp, prng->nprocs); + free(ptmp); + ptmp = tmp2; + } + } + } + OPAL_LIST_DESTRUCT(&prk[n]); // releases all the actives objects + opal_argv_append_nosize(&cache, ptmp); + free(ptmp); + ptmp = NULL; + } + free(prk); + free(cnt); + free(actives); + + *ppn = opal_argv_join(cache, '@'); + opal_argv_free(cache); + + return ORTE_SUCCESS; +} + +int orte_util_nidmap_parse_ppn(orte_job_t *jdata, char *regex) +{ + orte_node_t *node; + orte_proc_t *proc; + int n, k, m, cnt; + char **tmp, *ptr, **ppn; + orte_nidmap_regex_t *rng; + opal_list_t trk; + int rc = ORTE_SUCCESS; + + /* split the regex by app_context */ + tmp = opal_argv_split(regex, '@'); + + /* for each app_context, set the ppn */ + for (n=0; NULL != tmp[n]; n++) { + ppn = 
opal_argv_split(tmp[n], ','); + /* decompress the ppn */ + OBJ_CONSTRUCT(&trk, opal_list_t); + for (m=0; NULL != ppn[m]; m++) { + rng = OBJ_NEW(orte_nidmap_regex_t); + opal_list_append(&trk, &rng->super); + /* check for a count */ + if (NULL != (ptr = strchr(ppn[m], '('))) { + ppn[m][strlen(ppn[m])-1] = '\0'; // remove trailing paren + *ptr = '\0'; + ++ptr; + rng->cnt = strtoul(ptr, NULL, 10); + } else { + rng->cnt = 1; + } + /* convert the number */ + rng->nprocs = strtoul(ppn[m], NULL, 10); + } + opal_argv_free(ppn); + + /* cycle thru our node pool and add the indicated number of procs + * to each node */ + rng = (orte_nidmap_regex_t*)opal_list_get_first(&trk); + cnt = 0; + for (m=0; m < orte_node_pool->size; m++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, m))) { + continue; + } + /* see if it has any procs for this job and app_context */ + if (0 < rng->nprocs) { + /* add this node to the job map if it isn't already there */ + if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) { + OBJ_RETAIN(node); + ORTE_FLAG_SET(node, ORTE_NODE_FLAG_MAPPED); + opal_pointer_array_add(jdata->map->nodes, node); + } + /* create a proc object for each one */ + for (k=0; k < rng->nprocs; k++) { + proc = OBJ_NEW(orte_proc_t); + proc->name.jobid = jdata->jobid; + /* leave the vpid undefined as this will be determined + * later when we do the overall ranking */ + proc->app_idx = n; + proc->parent = node->daemon->name.vpid; + OBJ_RETAIN(node); + proc->node = node; + /* flag the proc as ready for launch */ + proc->state = ORTE_PROC_STATE_INIT; + opal_pointer_array_add(node->procs, proc); + /* we will add the proc to the jdata array when we + * compute its rank */ + } + node->num_procs += rng->nprocs; + } + ++cnt; + if (rng->cnt <= cnt) { + rng = (orte_nidmap_regex_t*)opal_list_get_next(&rng->super); + if (NULL == rng) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + opal_argv_free(tmp); + rc = ORTE_ERR_NOT_FOUND; + goto complete; + } + cnt = 0; + } + } + 
OPAL_LIST_DESTRUCT(&trk); + } + opal_argv_free(tmp); + + complete: + /* reset any node map flags we used so the next job will start clean */ + for (n=0; n < jdata->map->nodes->size; n++) { + if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, n))) { + ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED); + } + } + + return rc; +} diff --git a/orte/util/nidmap.h b/orte/util/nidmap.h index 3acc29b9277..e8c6f59bc21 100644 --- a/orte/util/nidmap.h +++ b/orte/util/nidmap.h @@ -46,7 +46,7 @@ BEGIN_C_DECLS ORTE_DECLSPEC void orte_util_nidmap_init(void); -ORTE_DECLSPEC int orte_util_nidmap_create(char **regex); +ORTE_DECLSPEC int orte_util_nidmap_create(opal_pointer_array_t *pool, char **regex); ORTE_DECLSPEC int orte_util_nidmap_parse(char *regex); /* create a regular expression describing the nodes in the @@ -59,6 +59,12 @@ ORTE_DECLSPEC int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer); ORTE_DECLSPEC int orte_util_build_daemon_nidmap(void); +/* create a regular expression describing the ppn for a job */ +ORTE_DECLSPEC int orte_util_nidmap_generate_ppn(orte_job_t *jdata, char **ppn); + +/* decode the ppn */ +ORTE_DECLSPEC int orte_util_nidmap_parse_ppn(orte_job_t *jdata, char *ppn); + END_C_DECLS #endif