From 657e701c6505e401412b5548c180a22c76832bf9 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Fri, 12 May 2017 16:16:47 -0700 Subject: [PATCH] Add debug verbosity to the orte data server and pmix pub/lookup functions Start updating the various mappers to the new procedure. Remove the stale lama component as it is now very out-of-date. Bring round_robin and PPR online, and modify the mindist component (but cannot test/debug it). Remove unneeded test Fix memory corruption by re-initializing variable to NULL in loop Resolve the race condition identified by @ggouaillardet by resetting the mapped flag within the same event where it was set. There is no need to retain the flag beyond that point as it isn't used again. Add a new job attribute ORTE_JOB_FULLY_DESCRIBED to indicate that all the job information (including locations and binding) is included in the launch message. Thus, the backend daemons do not need to do any map computation for the job. Use this for the seq, rankfile, and mindist mappers until someone decides to update them. Note that this will maintain functionality, but means that users of those three mappers will see large launch messages and less performant scaling than those using the other mappers. Have the mindist module add procs to the job's proc array as it is a fully described module Protect the hnp-not-in-allocation case Per path suggested by Gilles - protect the HNP node when it gets added in the absence of any other allocation or hostfile Signed-off-by: Ralph Castain --- .gitignore | 1 + opal/mca/pmix/base/pmix_base_fns.c | 36 +- orte/mca/odls/base/odls_base_default_fns.c | 203 +- orte/mca/plm/base/plm_base_launch_support.c | 4 +- orte/mca/rmaps/base/Makefile.am | 5 +- orte/mca/rmaps/base/base.h | 3 +- orte/mca/rmaps/base/help-orte-rmaps-base.txt | 12 +- .../rmaps/base/rmaps_base_assign_locations.c | 80 + orte/mca/rmaps/base/rmaps_base_map_job.c | 63 +- orte/mca/rmaps/base/rmaps_base_ranking.c | 685 +++--- orte/mca/rmaps/base/rmaps_base_support_fns.c | 36 +- orte/mca/rmaps/base/rmaps_private.h | 5 +- orte/mca/rmaps/lama/.opal_ignore | 0 orte/mca/rmaps/lama/Makefile.am | 40 - orte/mca/rmaps/lama/help-orte-rmaps-lama.txt | 173 -- orte/mca/rmaps/lama/owner.txt | 7 - orte/mca/rmaps/lama/rmaps_lama.h | 177 -- orte/mca/rmaps/lama/rmaps_lama_component.c | 136 -- orte/mca/rmaps/lama/rmaps_lama_max_tree.c | 1182 ---------- orte/mca/rmaps/lama/rmaps_lama_module.c | 1914 ----------------- orte/mca/rmaps/lama/rmaps_lama_params.c | 878 -------- orte/mca/rmaps/mindist/rmaps_mindist_module.c | 115 +- orte/mca/rmaps/ppr/rmaps_ppr.c | 128 +- orte/mca/rmaps/rank_file/rmaps_rank_file.c | 15 +- orte/mca/rmaps/resilient/rmaps_resilient.c | 44 +- orte/mca/rmaps/rmaps.h | 17 +- orte/mca/rmaps/round_robin/Makefile.am | 4 +- orte/mca/rmaps/round_robin/rmaps_rr.c | 118 +- orte/mca/rmaps/round_robin/rmaps_rr.h | 9 +- orte/mca/rmaps/round_robin/rmaps_rr_assign.c | 171 ++ orte/mca/rmaps/seq/rmaps_seq.c | 6 +- orte/mca/state/base/state_base_fns.c | 2 - orte/mca/state/dvm/state_dvm.c | 2 +- orte/mca/state/hnp/state_hnp.c | 4 + orte/mca/state/novm/state_novm.c | 31 +- orte/orted/pmix/pmix_server_pub.c | 8 + .../data_type_support/orte_dt_packing_fns.c | 120 +- .../data_type_support/orte_dt_unpacking_fns.c | 106 +- orte/runtime/orte_data_server.c | 73 +- orte/test/mpi/Makefile | 9 +- orte/test/mpi/no-disconnect.c | 210 ++ orte/util/attr.c | 2 + orte/util/attr.h | 1 + orte/util/nidmap.c | 220 +- orte/util/nidmap.h | 8 +- 45 files changed, 1886 insertions(+), 5177 deletions(-) create mode 100644 orte/mca/rmaps/base/rmaps_base_assign_locations.c delete mode 100644 orte/mca/rmaps/lama/.opal_ignore delete mode 100644 orte/mca/rmaps/lama/Makefile.am delete mode 100644 orte/mca/rmaps/lama/help-orte-rmaps-lama.txt delete mode 100644 orte/mca/rmaps/lama/owner.txt delete mode 100644 orte/mca/rmaps/lama/rmaps_lama.h delete mode 100644 orte/mca/rmaps/lama/rmaps_lama_component.c delete mode 100644 orte/mca/rmaps/lama/rmaps_lama_max_tree.c delete mode 100644 orte/mca/rmaps/lama/rmaps_lama_module.c delete mode 100644 orte/mca/rmaps/lama/rmaps_lama_params.c create mode 100644 orte/mca/rmaps/round_robin/rmaps_rr_assign.c create mode 100644 orte/test/mpi/no-disconnect.c diff --git a/.gitignore b/.gitignore index 36908c03f07..1228a7948ed 100644 --- a/.gitignore +++ b/.gitignore @@ -415,6 +415,7 @@ orte/test/mpi/memcached-dummy orte/test/mpi/coll_test orte/test/mpi/badcoll orte/test/mpi/iof +orte/test/mpi/no-disconnect orte/test/system/radix orte/test/system/sigusr_trap diff --git a/opal/mca/pmix/base/pmix_base_fns.c b/opal/mca/pmix/base/pmix_base_fns.c index bee99bd8062..cb9e4ccf43f 100644 --- a/opal/mca/pmix/base/pmix_base_fns.c +++ b/opal/mca/pmix/base/pmix_base_fns.c @@ -2,7 +2,7 @@ /* * Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2014-2015 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. * Copyright (c) 2014-2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2016 Mellanox Technologies, Inc. @@ -118,6 +118,12 @@ static void lookup_cbfunc(int status, opal_list_t *data, void *cbdata) cd->active = false; } +static void opcbfunc(int status, void *cbdata) +{ + struct lookup_caddy_t *cd = (struct lookup_caddy_t*)cbdata; + cd->active = false; +} + int opal_pmix_base_exchange(opal_value_t *indat, opal_pmix_pdata_t *outdat, int timeout) @@ -141,11 +147,29 @@ int opal_pmix_base_exchange(opal_value_t *indat, opal_list_append(&ilist, &info->super); /* publish it with "session" scope */ - rc = opal_pmix.publish(&ilist); - OPAL_LIST_DESTRUCT(&ilist); - if (OPAL_SUCCESS != rc) { - OPAL_ERROR_LOG(rc); - return rc; + if (NULL == opal_pmix.publish_nb) { + rc = opal_pmix.publish(&ilist); + OPAL_LIST_DESTRUCT(&ilist); + if (OPAL_SUCCESS != rc) { + OPAL_ERROR_LOG(rc); + return rc; + } + } else { + caddy.active = true; + rc = opal_pmix.publish_nb(&ilist, opcbfunc, &caddy); + if (OPAL_SUCCESS != rc) { + OPAL_ERROR_LOG(rc); + OPAL_LIST_DESTRUCT(&ilist); + return rc; + } + while (caddy.active) { + usleep(10); + } + OPAL_LIST_DESTRUCT(&ilist); + if (OPAL_SUCCESS != caddy.status) { + OPAL_ERROR_LOG(caddy.status); + return caddy.status; + } } /* lookup the other side's info - if a non-blocking form diff --git a/orte/mca/odls/base/odls_base_default_fns.c b/orte/mca/odls/base/odls_base_default_fns.c index 30462ac4faa..8ce47c18e3b 100644 --- a/orte/mca/odls/base/odls_base_default_fns.c +++ b/orte/mca/odls/base/odls_base_default_fns.c @@ -131,7 +131,7 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *buffer, /* if we couldn't provide the allocation regex on the orted * cmd line, then we need to provide all the info here */ if (!orte_nidmap_communicated) { - if (ORTE_SUCCESS != (rc = orte_util_nidmap_create(&nidmap))) { + if (ORTE_SUCCESS != (rc = orte_util_nidmap_create(orte_node_pool, &nidmap))) { ORTE_ERROR_LOG(rc); return rc; } @@ -246,6 +246,22 @@ int orte_odls_base_default_get_add_procs_data(opal_buffer_t *buffer, return rc; } + if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) { + /* compute and pack the ppn regex */ + if (ORTE_SUCCESS != (rc = orte_util_nidmap_generate_ppn(jdata, &nidmap))) { + ORTE_ERROR_LOG(rc); + return rc; + } + if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &nidmap, 1, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + free(nidmap); + return rc; + } + free(nidmap); + } + + /* compute and pack the regex of ppn */ + return ORTE_SUCCESS; } @@ -262,13 +278,12 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer, int rc; orte_std_cntr_t cnt; orte_job_t *jdata=NULL, *daemons; - int32_t n, k, m; + int32_t n, k; opal_buffer_t *bptr; - orte_node_t *node; orte_proc_t *pptr, *dmn; orte_app_context_t *app; - bool newmap = false; int8_t flag; + char *ppn; OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output, "%s odls:constructing child list", @@ -356,7 +371,7 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer, * the storage */ jdata->jobid = ORTE_JOBID_INVALID; OBJ_RELEASE(jdata); - /* get the correct job object */ + /* get the correct job object - it will be completely filled out */ if (NULL == (jdata = orte_get_job_data_object(*job))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); rc = ORTE_ERR_NOT_FOUND; @@ -364,25 +379,65 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer, } } else { opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, jdata); - } - /* ensure the map object is present */ - if (NULL == jdata->map) { - jdata->map = OBJ_NEW(orte_job_map_t); - newmap = true; + /* ensure the map object is present */ + if (NULL == jdata->map) { + jdata->map = OBJ_NEW(orte_job_map_t); + } } - if (orte_no_vm) { - /* if we are operating novm, then mpirun will have sent us - * the complete array of procs - process it */ - for (n=0; n < jdata->procs->size; n++) { - if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, n))) { - continue; + /* if the job is fully described, then mpirun will have computed + * and sent us the complete array of procs in the orte_job_t, so we + * don't need to do anything more here */ + if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) { + if (!ORTE_PROC_IS_HNP) { + /* extract the ppn regex */ + cnt = 1; + if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &ppn, &cnt, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + goto REPORT_ERROR; } - if (ORTE_PROC_STATE_UNDEF == pptr->state) { - /* not ready for use yet */ - continue; + /* populate the node array of the job map and the proc array of + * the job object so we know how many procs are on each node */ + if (ORTE_SUCCESS != (rc = orte_util_nidmap_parse_ppn(jdata, ppn))) { + ORTE_ERROR_LOG(rc); + free(ppn); + goto REPORT_ERROR; + } + free(ppn); + /* now assign locations to the procs */ + if (ORTE_SUCCESS != (rc = orte_rmaps_base_assign_locations(jdata))) { + ORTE_ERROR_LOG(rc); + goto REPORT_ERROR; } + } + /* compute the ranks and add the proc objects + * to the jdata->procs array */ + if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) { + ORTE_ERROR_LOG(rc); + goto REPORT_ERROR; + } + /* and finally, compute the local and node ranks */ + if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) { + ORTE_ERROR_LOG(rc); + goto REPORT_ERROR; + } + } + + /* now that the node array in the job map and jdata are completely filled out,. + * we need to "wireup" the procs to their nodes so other utilities can + * locate them */ + for (n=0; n < jdata->procs->size; n++) { + if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, n))) { + continue; + } + if (ORTE_PROC_STATE_UNDEF == pptr->state) { + /* not ready for use yet */ + continue; + } + if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) { + /* the parser will have already made the connection, but the fully described + * case won't have done it, so connect the proc to its node here */ opal_output_verbose(5, orte_odls_base_framework.framework_output, "%s GETTING DAEMON FOR PROC %s WITH PARENT %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), @@ -401,86 +456,37 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer, } OBJ_RETAIN(dmn->node); pptr->node = dmn->node; - /* add proc to node - note that num_procs for the - * node was already correctly unpacked, so don't - * increment it here */ - OBJ_RETAIN(pptr); - opal_pointer_array_add(dmn->node->procs, pptr); - - /* add the node to the map, if not already there */ - if (!ORTE_FLAG_TEST(dmn->node, ORTE_NODE_FLAG_MAPPED)) { - OBJ_RETAIN(dmn->node); - ORTE_FLAG_SET(dmn->node, ORTE_NODE_FLAG_MAPPED); - opal_pointer_array_add(jdata->map->nodes, dmn->node); - if (newmap) { - jdata->map->num_nodes++; - } - } - - /* see if it belongs to us */ - if (pptr->parent == ORTE_PROC_MY_NAME->vpid) { - /* is this child on our current list of children */ - if (!ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_LOCAL)) { - /* not on the local list */ - OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output, - "%s[%s:%d] adding proc %s to my local list", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - __FILE__, __LINE__, - ORTE_NAME_PRINT(&pptr->name))); - /* keep tabs of the number of local procs */ - jdata->num_local_procs++; - /* add this proc to our child list */ - OBJ_RETAIN(pptr); - ORTE_FLAG_SET(pptr, ORTE_PROC_FLAG_LOCAL); - opal_pointer_array_add(orte_local_children, pptr); - } - - /* if the job is in restart mode, the child must not barrier when launched */ - if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) { - orte_set_attribute(&pptr->attributes, ORTE_PROC_NOBARRIER, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL); - } - /* mark that this app_context is being used on this node */ - app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pptr->app_idx); - ORTE_FLAG_SET(app, ORTE_APP_FLAG_USED_ON_NODE); - } - } - } else { - /* create the map - will already have been done for the novm case */ - if (ORTE_SUCCESS != (rc = orte_rmaps_base_map_job(jdata))) { - ORTE_ERROR_LOG(rc); - goto REPORT_ERROR; } - /* find our local procs */ - for (n=0; n < jdata->map->nodes->size; n++) { - if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, n))) { - continue; - } - if (node->index != (int)ORTE_PROC_MY_NAME->vpid) { - continue; + /* see if it belongs to us */ + if (pptr->parent == ORTE_PROC_MY_NAME->vpid) { + /* is this child on our current list of children */ + if (!ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_LOCAL)) { + /* not on the local list */ + OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output, + "%s[%s:%d] adding proc %s to my local list", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + __FILE__, __LINE__, + ORTE_NAME_PRINT(&pptr->name))); + /* keep tabs of the number of local procs */ + jdata->num_local_procs++; + /* add this proc to our child list */ + OBJ_RETAIN(pptr); + ORTE_FLAG_SET(pptr, ORTE_PROC_FLAG_LOCAL); + opal_pointer_array_add(orte_local_children, pptr); } - for (m=0; m < node->procs->size; m++) { - if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, m))) { - continue; - } - if (!ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_LOCAL)) { - /* not on the local list */ - OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output, - "%s[%s:%d] adding proc %s to my local list", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - __FILE__, __LINE__, - ORTE_NAME_PRINT(&pptr->name))); - /* keep tabs of the number of local procs */ - jdata->num_local_procs++; - /* add this proc to our child list */ - OBJ_RETAIN(pptr); - ORTE_FLAG_SET(pptr, ORTE_PROC_FLAG_LOCAL); - opal_pointer_array_add(orte_local_children, pptr); - /* mark that this app_context is being used on this node */ - app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pptr->app_idx); - ORTE_FLAG_SET(app, ORTE_APP_FLAG_USED_ON_NODE); - } + + /* if the job is in restart mode, the child must not barrier when launched */ + if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) { + orte_set_attribute(&pptr->attributes, ORTE_PROC_NOBARRIER, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL); } + /* mark that this app_context is being used on this node */ + app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pptr->app_idx); + ORTE_FLAG_SET(app, ORTE_APP_FLAG_USED_ON_NODE); } + } + + if (!ORTE_PROC_IS_HNP && + !orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) { /* compute and save bindings of local children */ if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_bindings(jdata))) { ORTE_ERROR_LOG(rc); @@ -488,13 +494,6 @@ int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer, } } - /* reset any node map flags we used so the next job will start clean */ - for (n=0; n < jdata->map->nodes->size; n++) { - if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, n))) { - ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED); - } - } - /* if we wanted to see the map, now is the time to display it */ if (jdata->map->display_map) { orte_rmaps_base_display_map(jdata); diff --git a/orte/mca/plm/base/plm_base_launch_support.c b/orte/mca/plm/base/plm_base_launch_support.c index 677535aacf6..0c54807a7e6 100644 --- a/orte/mca/plm/base/plm_base_launch_support.c +++ b/orte/mca/plm/base/plm_base_launch_support.c @@ -209,7 +209,7 @@ static void files_ready(int status, void *cbdata) if (ORTE_SUCCESS != status) { ORTE_FORCED_TERMINATE(status); } else { - ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_SYSTEM_PREP); + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP); } } @@ -1497,7 +1497,7 @@ int orte_plm_base_orted_append_basic_args(int *argc, char ***argv, /* convert the nodes with daemons to a regex */ param = NULL; - if (ORTE_SUCCESS != (rc = orte_util_nidmap_create(¶m))) { + if (ORTE_SUCCESS != (rc = orte_util_nidmap_create(orte_node_pool, ¶m))) { ORTE_ERROR_LOG(rc); return rc; } diff --git a/orte/mca/rmaps/base/Makefile.am b/orte/mca/rmaps/base/Makefile.am index 41b0420847c..d2930632ea4 100644 --- a/orte/mca/rmaps/base/Makefile.am +++ b/orte/mca/rmaps/base/Makefile.am @@ -12,7 +12,7 @@ # Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2011 Los Alamos National Security, LLC. # All rights reserved. -# Copyright (c) 2015 Intel, Inc. All rights reserved. +# Copyright (c) 2015-2017 Intel, Inc. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -31,7 +31,8 @@ libmca_rmaps_la_SOURCES += \ base/rmaps_base_support_fns.c \ base/rmaps_base_ranking.c \ base/rmaps_base_print_fns.c \ - base/rmaps_base_binding.c + base/rmaps_base_binding.c \ + base/rmaps_base_assign_locations.c dist_ortedata_DATA = base/help-orte-rmaps-base.txt diff --git a/orte/mca/rmaps/base/base.h b/orte/mca/rmaps/base/base.h index b1f540241a7..beb4cee0445 100644 --- a/orte/mca/rmaps/base/base.h +++ b/orte/mca/rmaps/base/base.h @@ -99,7 +99,8 @@ OBJ_CLASS_DECLARATION(orte_rmaps_base_selected_module_t); /* * Map a job */ -ORTE_DECLSPEC int orte_rmaps_base_map_job(orte_job_t *jdata); +ORTE_DECLSPEC void orte_rmaps_base_map_job(int sd, short args, void *cbdata); +ORTE_DECLSPEC int orte_rmaps_base_assign_locations(orte_job_t *jdata); /** * Utility routines to get/set vpid mapping for the job diff --git a/orte/mca/rmaps/base/help-orte-rmaps-base.txt b/orte/mca/rmaps/base/help-orte-rmaps-base.txt index c04acf413d9..2f5f5b5d0c7 100644 --- a/orte/mca/rmaps/base/help-orte-rmaps-base.txt +++ b/orte/mca/rmaps/base/help-orte-rmaps-base.txt @@ -13,7 +13,7 @@ # Copyright (c) 2011-2015 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2011 Los Alamos National Security, LLC. # All rights reserved. -# Copyright (c) 2014 Intel, Inc. All rights reserved. +# Copyright (c) 2014-2017 Intel, Inc. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -410,3 +410,13 @@ Either the -host or -hostfile options were given, but the number of processes to start was omitted. This combination is not supported. Please specify the number of processes to run and try again. +# +[failed-assignments] +The attempt to assign hardware locations to processes on a +compute node failed: + + Node: %s + Policy: %s + +We cannot continue - please check that the policy is in +accordance with the actual available hardware. diff --git a/orte/mca/rmaps/base/rmaps_base_assign_locations.c b/orte/mca/rmaps/base/rmaps_base_assign_locations.c new file mode 100644 index 00000000000..b1536ded0aa --- /dev/null +++ b/orte/mca/rmaps/base/rmaps_base_assign_locations.c @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2011-2012 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011-2012 Los Alamos National Security, LLC. + * All rights reserved. + * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. + * Copyright (c) 2016 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" +#include "orte/constants.h" + +#include + +#include "orte/mca/mca.h" +#include "opal/util/output.h" +#include "opal/mca/base/base.h" + +#include "orte/runtime/orte_globals.h" +#include "orte/util/show_help.h" +#include "orte/mca/errmgr/errmgr.h" + +#include "orte/mca/rmaps/base/base.h" +#include "orte/mca/rmaps/base/rmaps_private.h" + + +int orte_rmaps_base_assign_locations(orte_job_t *jdata) +{ + int rc; + orte_rmaps_base_selected_module_t *mod; + + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps: assigning locations for job %s", + ORTE_JOBID_PRINT(jdata->jobid)); + + /* cycle thru the available mappers until one agrees to assign + * locations for the job + */ + if (1 == opal_list_get_size(&orte_rmaps_base.selected_modules)) { + /* forced selection */ + mod = (orte_rmaps_base_selected_module_t*)opal_list_get_first(&orte_rmaps_base.selected_modules); + jdata->map->req_mapper = strdup(mod->component->mca_component_name); + } + OPAL_LIST_FOREACH(mod, &orte_rmaps_base.selected_modules, orte_rmaps_base_selected_module_t) { + if (NULL == mod->module->assign_locations) { + continue; + } + if (ORTE_SUCCESS == (rc = mod->module->assign_locations(jdata))) { + return rc; + } + /* mappers return "next option" if they didn't attempt to + * process the job. anything else is a true error. + */ + if (ORTE_ERR_TAKE_NEXT_OPTION != rc) { + ORTE_ERROR_LOG(rc); + return rc; + } + } + + /* if we get here without doing the assignments, then that's an error */ + orte_show_help("help-orte-rmaps-base.txt", "failed-assignments", true, + orte_process_info.nodename, + orte_rmaps_base_print_mapping(jdata->map->mapping)); + return ORTE_ERROR; +} diff --git a/orte/mca/rmaps/base/rmaps_base_map_job.c b/orte/mca/rmaps/base/rmaps_base_map_job.c index 8254bcfaf16..d5e2ac304dc 100644 --- a/orte/mca/rmaps/base/rmaps_base_map_job.c +++ b/orte/mca/rmaps/base/rmaps_base_map_job.c @@ -42,8 +42,10 @@ #include "orte/mca/rmaps/base/rmaps_private.h" -int orte_rmaps_base_map_job(orte_job_t *jdata) +void orte_rmaps_base_map_job(int fd, short args, void *cbdata) { + orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; + orte_job_t *jdata = caddy->jdata; orte_node_t *node; int rc, i, ppx = 0; bool did_map, given, pernode = false; @@ -116,7 +118,9 @@ int orte_rmaps_base_map_job(orte_job_t *jdata) /* inform the user of the error */ orte_show_help("help-orte-rmaps-base.txt", "num-procs-not-specified", true); OPAL_LIST_DESTRUCT(&nodes); - return ORTE_ERR_BAD_PARAM; + OBJ_RELEASE(caddy); + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED); + return; } } nprocs += slots; @@ -335,7 +339,9 @@ int orte_rmaps_base_map_job(orte_job_t *jdata) int i; if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return ORTE_ERR_NOT_FOUND; + OBJ_RELEASE(caddy); + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED); + return; } t0 = node->topology; for (i=1; i < orte_node_pool->size; i++) { @@ -368,15 +374,26 @@ int orte_rmaps_base_map_job(orte_job_t *jdata) */ if (ORTE_ERR_TAKE_NEXT_OPTION != rc) { ORTE_ERROR_LOG(rc); - return rc; + OBJ_RELEASE(caddy); + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED); + return; } } + /* reset any node map flags we used so the next job will start clean */ + for (i=0; i < jdata->map->nodes->size; i++) { + if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, i))) { + ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED); + } + } + if (did_map && ORTE_ERR_RESOURCE_BUSY == rc) { /* the map was done but nothing could be mapped * for launch as all the resources were busy */ orte_show_help("help-orte-rmaps-base.txt", "cannot-launch", true); - return rc; + OBJ_RELEASE(caddy); + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED); + return; } /* if we get here without doing the map, or with zero procs in @@ -386,7 +403,9 @@ int orte_rmaps_base_map_job(orte_job_t *jdata) orte_show_help("help-orte-rmaps-base.txt", "failed-map", true, did_map ? "mapped" : "unmapped", jdata->num_procs, jdata->map->num_nodes); - return ORTE_ERR_INVALID_NUM_PROCS; + OBJ_RELEASE(caddy); + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED); + return; } /* if any node is oversubscribed, then check to see if a binding @@ -399,17 +418,29 @@ int orte_rmaps_base_map_job(orte_job_t *jdata) } } - /* compute and save local ranks */ - if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) { - ORTE_ERROR_LOG(rc); - return rc; - } + if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) { + /* compute and save location assignments */ + if (ORTE_SUCCESS != (rc = orte_rmaps_base_assign_locations(jdata))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(caddy); + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED); + return; + } + } else { + /* compute and save local ranks */ + if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(caddy); + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED); + return; + } - if (orte_no_vm) { /* compute and save bindings */ if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_bindings(jdata))) { ORTE_ERROR_LOG(rc); - return rc; + OBJ_RELEASE(caddy); + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED); + return; } } @@ -427,7 +458,11 @@ int orte_rmaps_base_map_job(orte_job_t *jdata) } } - return ORTE_SUCCESS; + /* set the job state to the next position */ + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_COMPLETE); + + /* cleanup */ + OBJ_RELEASE(caddy); } void orte_rmaps_base_display_map(orte_job_t *jdata) diff --git a/orte/mca/rmaps/base/rmaps_base_ranking.c b/orte/mca/rmaps/base/rmaps_base_ranking.c index b297290a4d6..cb5d6a09a0c 100644 --- a/orte/mca/rmaps/base/rmaps_base_ranking.c +++ b/orte/mca/rmaps/base/rmaps_base_ranking.c @@ -49,19 +49,17 @@ #include "orte/mca/rmaps/base/base.h" static int rank_span(orte_job_t *jdata, - orte_app_context_t *app, - opal_list_t *nodes, hwloc_obj_type_t target, unsigned cache_level) { + orte_app_context_t *app; hwloc_obj_t obj; - int num_objs, i, j, rc; + int num_objs, i, j, m, n, rc; orte_vpid_t num_ranked=0; orte_node_t *node; - orte_proc_t *proc; + orte_proc_t *proc, *pptr; orte_vpid_t vpid; int cnt; - opal_list_item_t *item; hwloc_obj_t locale; opal_output_verbose(5, orte_rmaps_base_framework.framework_output, @@ -85,18 +83,144 @@ static int rank_span(orte_job_t *jdata, * are mapped */ - vpid = jdata->num_procs; - cnt = 0; - while (cnt < app->num_procs) { - for (item = opal_list_get_first(nodes); - item != opal_list_get_end(nodes); - item = opal_list_get_next(item)) { - node = (orte_node_t*)item; + vpid = 0; + for (n=0; n < jdata->apps->size; n++) { + if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, n))) { + continue; + } + + cnt = 0; + while (cnt < app->num_procs) { + for (m=0; m < jdata->map->nodes->size; m++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, m))) { + continue; + } + /* get the number of objects - only consider those we can actually use */ + num_objs = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, target, + cache_level, OPAL_HWLOC_AVAILABLE); + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rank_span: found %d objects on node %s with %d procs", + num_objs, node->name, (int)node->num_procs); + if (0 == num_objs) { + return ORTE_ERR_NOT_SUPPORTED; + } + + /* for each object */ + for (i=0; i < num_objs && cnt < app->num_procs; i++) { + obj = opal_hwloc_base_get_obj_by_type(node->topology->topo, target, + cache_level, i, OPAL_HWLOC_AVAILABLE); + + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rank_span: working object %d", i); + + /* cycle thru the procs on this node */ + for (j=0; j < node->procs->size && cnt < app->num_procs; j++) { + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { + continue; + } + /* ignore procs from other jobs */ + if (proc->name.jobid != jdata->jobid) { + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rank_span skipping proc %s - from another job, num_ranked %d", + ORTE_NAME_PRINT(&proc->name), num_ranked); + continue; + } + /* ignore procs that are already assigned */ + if (ORTE_VPID_INVALID != proc->name.vpid) { + continue; + } + /* ignore procs from other apps */ + if (proc->app_idx != app->idx) { + continue; + } + /* protect against bozo case */ + locale = NULL; + if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) { + ORTE_ERROR_LOG(ORTE_ERROR); + return ORTE_ERROR; + } + /* ignore procs not on this object */ + if (!hwloc_bitmap_intersects(obj->cpuset, locale->cpuset)) { + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rank_span: proc at position %d is not on object %d", + j, i); + continue; + } + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rank_span: assigning vpid %s", ORTE_VPID_PRINT(vpid)); + proc->name.vpid = vpid++; + if (0 == cnt) { + app->first_rank = proc->name.vpid; + } + cnt++; + + /* insert the proc into the jdata array */ + if (NULL != (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->name.vpid))) { + OBJ_RELEASE(pptr); + } + OBJ_RETAIN(proc); + if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /* track where the highest vpid landed - this is our + * new bookmark + */ + jdata->bookmark = node; + /* move to next object */ + break; + } + } + } + } + } + + return ORTE_SUCCESS; +} + +static int rank_fill(orte_job_t *jdata, + hwloc_obj_type_t target, + unsigned cache_level) +{ + orte_app_context_t *app; + hwloc_obj_t obj; + int num_objs, i, j, m, n, rc; + orte_vpid_t num_ranked=0; + orte_node_t *node; + orte_proc_t *proc, *pptr; + orte_vpid_t vpid; + int cnt; + hwloc_obj_t locale; + + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rank_fill: for job %s", + ORTE_JOBID_PRINT(jdata->jobid)); + + /* if the ranking is fill, then we rank all the procs + * within a given object before moving on to the next + * + * Node 0 Node 1 + * Obj 0 Obj 1 Obj 0 Obj 1 + * 0 1 4 5 8 9 12 13 + * 2 3 6 7 10 11 14 15 + */ + + vpid = 0; + for (n=0; n < jdata->apps->size; n++) { + if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, n))) { + continue; + } + + cnt = 0; + for (m=0; m < jdata->map->nodes->size; m++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, m))) { + continue; + } /* get the number of objects - only consider those we can actually use */ num_objs = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, target, cache_level, OPAL_HWLOC_AVAILABLE); opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_span: found %d objects on node %s with %d procs", + "mca:rmaps:rank_fill: found %d objects on node %s with %d procs", num_objs, node->name, (int)node->num_procs); if (0 == num_objs) { return ORTE_ERR_NOT_SUPPORTED; @@ -108,7 +232,7 @@ static int rank_span(orte_job_t *jdata, cache_level, i, OPAL_HWLOC_AVAILABLE); opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_span: working object %d", i); + "mca:rmaps:rank_fill: working object %d", i); /* cycle thru the procs on this node */ for (j=0; j < node->procs->size && cnt < app->num_procs; j++) { @@ -118,7 +242,7 @@ static int rank_span(orte_job_t *jdata, /* ignore procs from other jobs */ if (proc->name.jobid != jdata->jobid) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_span skipping proc %s - from another job, num_ranked %d", + "mca:rmaps:rank_fill skipping proc %s - from another job, num_ranked %d", ORTE_NAME_PRINT(&proc->name), num_ranked); continue; } @@ -130,7 +254,7 @@ static int rank_span(orte_job_t *jdata, if (proc->app_idx != app->idx) { continue; } - /* protect against bozo case */ + /* protect against bozo case */ locale = NULL; if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) { ORTE_ERROR_LOG(ORTE_ERROR); @@ -139,19 +263,23 @@ static int rank_span(orte_job_t *jdata, /* ignore procs not on this object */ if (!hwloc_bitmap_intersects(obj->cpuset, locale->cpuset)) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_span: proc at position %d is not on object %d", + "mca:rmaps:rank_fill: proc at position %d is not on object %d", j, i); continue; } opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_span: assigning vpid %s", ORTE_VPID_PRINT(vpid)); + "mca:rmaps:rank_fill: assigning vpid %s", ORTE_VPID_PRINT(vpid)); proc->name.vpid = vpid++; if (0 == cnt) { app->first_rank = proc->name.vpid; } cnt++; - /* insert the proc into the jdata array - no harm if already there */ + /* insert the proc into the jdata array */ + if (NULL != (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->name.vpid))) { + OBJ_RELEASE(pptr); + } + OBJ_RETAIN(proc); if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) { ORTE_ERROR_LOG(rc); return rc; @@ -160,8 +288,6 @@ static int rank_span(orte_job_t *jdata, * new bookmark */ jdata->bookmark = node; - /* move to next object */ - break; } } } @@ -170,138 +296,26 @@ static int rank_span(orte_job_t *jdata, return ORTE_SUCCESS; } -static int rank_fill(orte_job_t *jdata, - orte_app_context_t *app, - opal_list_t *nodes, - hwloc_obj_type_t target, - unsigned cache_level) -{ - hwloc_obj_t obj; - int num_objs, i, j, rc; - orte_vpid_t num_ranked=0; - orte_node_t *node; - orte_proc_t *proc; - orte_vpid_t vpid; - int cnt; - opal_list_item_t *item; - hwloc_obj_t locale; - - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_fill: for job %s", - ORTE_JOBID_PRINT(jdata->jobid)); - - /* if the ranking is fill, then we rank all the procs - * within a given object before moving on to the next - * - * Node 0 Node 1 - * Obj 0 Obj 1 Obj 0 Obj 1 - * 0 1 4 5 8 9 12 13 - * 2 3 6 7 10 11 14 15 - */ - - vpid = jdata->num_procs; - cnt = 0; - for (item = opal_list_get_first(nodes); - item != opal_list_get_end(nodes); - item = opal_list_get_next(item)) { - node = (orte_node_t*)item; - /* get the number of objects - only consider those we can actually use */ - num_objs = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, target, - cache_level, OPAL_HWLOC_AVAILABLE); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_fill: found %d objects on node %s with %d procs", - num_objs, node->name, (int)node->num_procs); - if (0 == num_objs) { - return ORTE_ERR_NOT_SUPPORTED; - } - - /* for each object */ - for (i=0; i < num_objs && cnt < app->num_procs; i++) { - obj = opal_hwloc_base_get_obj_by_type(node->topology->topo, target, - cache_level, i, OPAL_HWLOC_AVAILABLE); - - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_fill: working object %d", i); - - /* cycle thru the procs on this node */ - for (j=0; j < node->procs->size && cnt < app->num_procs; j++) { - if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { - continue; - } - /* ignore procs from other jobs */ - if (proc->name.jobid != jdata->jobid) { - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_fill skipping proc %s - from another job, num_ranked %d", - ORTE_NAME_PRINT(&proc->name), num_ranked); - continue; - } - /* ignore procs that are already assigned */ - if (ORTE_VPID_INVALID != proc->name.vpid) { - continue; - } - /* ignore procs from other apps */ - if (proc->app_idx != app->idx) { - continue; - } - /* protect against bozo case */ - locale = NULL; - if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) { - ORTE_ERROR_LOG(ORTE_ERROR); - return ORTE_ERROR; - } - /* ignore procs not on this object */ - if (!hwloc_bitmap_intersects(obj->cpuset, locale->cpuset)) { - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_fill: proc at position %d is not on object %d", - j, i); - continue; - } - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_fill: assigning vpid %s", ORTE_VPID_PRINT(vpid)); - proc->name.vpid = vpid++; - if (0 == cnt) { - app->first_rank = proc->name.vpid; - } - cnt++; - - /* insert the proc into the jdata array - no harm if already there */ - if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* track where the highest vpid landed - this is our - * new bookmark - */ - jdata->bookmark = node; - } - } - } - - return ORTE_SUCCESS; -} - static int rank_by(orte_job_t *jdata, - orte_app_context_t *app, - opal_list_t *nodes, hwloc_obj_type_t target, unsigned cache_level) { + orte_app_context_t *app; hwloc_obj_t obj; - int num_objs, i, j, rc; + int num_objs, i, j, m, n, rc; orte_vpid_t num_ranked=0; orte_node_t *node; - orte_proc_t *proc; + orte_proc_t *proc, *pptr; orte_vpid_t vpid; int cnt; opal_pointer_array_t objs; bool all_done; - opal_list_item_t *item; hwloc_obj_t locale; if (ORTE_RANKING_SPAN & ORTE_GET_RANKING_DIRECTIVE(jdata->map->ranking)) { - return rank_span(jdata, app, nodes, target, cache_level); + return rank_span(jdata, target, cache_level); } else if (ORTE_RANKING_FILL & ORTE_GET_RANKING_DIRECTIVE(jdata->map->ranking)) { - return rank_fill(jdata, app, nodes, target, cache_level); + return rank_fill(jdata, target, cache_level); } /* if ranking is not spanned or filled, then we @@ -316,122 +330,140 @@ static int rank_by(orte_job_t *jdata, * 4 6 5 7 12 14 13 15 */ - /* setup the pointer array */ - OBJ_CONSTRUCT(&objs, opal_pointer_array_t); - opal_pointer_array_init(&objs, 2, INT_MAX, 2); - - vpid = jdata->num_procs; - cnt = 0; - for (item = opal_list_get_first(nodes); - item != opal_list_get_end(nodes); - item = opal_list_get_next(item)) { - node = (orte_node_t*)item; - /* get the number of objects - only consider those we can actually use */ - num_objs = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, target, - cache_level, OPAL_HWLOC_AVAILABLE); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_by: found %d objects on node %s with %d procs", - num_objs, node->name, (int)node->num_procs); - if (0 == num_objs) { - return ORTE_ERR_NOT_SUPPORTED; - } - /* collect all the objects */ - for (i=0; i < num_objs; i++) { - obj = opal_hwloc_base_get_obj_by_type(node->topology->topo, target, - cache_level, i, OPAL_HWLOC_AVAILABLE); - opal_pointer_array_set_item(&objs, i, obj); + vpid = 0; + for (n=0; n < jdata->apps->size; n++) { + if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, n))) { + continue; } - /* cycle across the objects, assigning a proc to each one, - * until all procs have been assigned - unfortunately, since - * more than this job may be mapped onto a node, the number - * of procs on the node can't be used to tell us when we - * are done. Instead, we have to just keep going until all - * procs are ranked - which means we have to make one extra - * pass thru the loop - * - * Perhaps someday someone will come up with a more efficient - * algorithm, but this works for now. - */ - all_done = false; - while (!all_done && cnt < app->num_procs) { - all_done = true; - /* cycle across the objects */ - for (i=0; i < num_objs && cnt < app->num_procs; i++) { - obj = (hwloc_obj_t)opal_pointer_array_get_item(&objs, i); + /* setup the pointer array */ + OBJ_CONSTRUCT(&objs, opal_pointer_array_t); + opal_pointer_array_init(&objs, 2, INT_MAX, 2); - /* find the next proc on this object */ - for (j=0; j < node->procs->size && cnt < app->num_procs; j++) { - if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { - continue; - } - /* ignore procs from other jobs */ - if (proc->name.jobid != jdata->jobid) { - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_by skipping proc %s - from another job, num_ranked %d", - ORTE_NAME_PRINT(&proc->name), num_ranked); - continue; - } - /* ignore procs that are already ranked */ - if (ORTE_VPID_INVALID != proc->name.vpid) { - continue; - } - /* ignore procs from other apps */ - if (proc->app_idx != app->idx) { - continue; - } - if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) { - continue; - } - /* ignore procs on other objects */ - if (!hwloc_bitmap_intersects(obj->cpuset, locale->cpuset)) { + cnt = 0; + for (m=0; m < jdata->map->nodes->size; m++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, m))) { + continue; + } + + /* get the number of objects - only consider those we can actually use */ + num_objs = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, target, + cache_level, OPAL_HWLOC_AVAILABLE); + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rank_by: found %d objects on node %s with %d procs", + num_objs, node->name, (int)node->num_procs); + if (0 == num_objs) { + OBJ_DESTRUCT(&objs); + return ORTE_ERR_NOT_SUPPORTED; + } + /* collect all the objects */ + for (i=0; i < num_objs; i++) { + obj = opal_hwloc_base_get_obj_by_type(node->topology->topo, target, + cache_level, i, OPAL_HWLOC_AVAILABLE); + opal_pointer_array_set_item(&objs, i, obj); + } + + /* cycle across the objects, assigning a proc to each one, + * until all procs have been assigned - unfortunately, since + * more than this job may be mapped onto a node, the number + * of procs on the node can't be used to tell us when we + * are done. Instead, we have to just keep going until all + * procs are ranked - which means we have to make one extra + * pass thru the loop + * + * Perhaps someday someone will come up with a more efficient + * algorithm, but this works for now. + */ + all_done = false; + while (!all_done && cnt < app->num_procs) { + all_done = true; + /* cycle across the objects */ + for (i=0; i < num_objs && cnt < app->num_procs && all_done; i++) { + obj = (hwloc_obj_t)opal_pointer_array_get_item(&objs, i); + /* find the next proc for this job and app_context */ + for (j=0; j < node->procs->size && cnt < app->num_procs; j++) { + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { + continue; + } + /* ignore procs from other jobs */ + if (proc->name.jobid != jdata->jobid) { + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rank_by skipping proc %s - from another job, num_ranked %d", + ORTE_NAME_PRINT(&proc->name), num_ranked); + continue; + } + /* ignore procs that are already ranked */ + if (ORTE_VPID_INVALID != proc->name.vpid) { + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rank_by skipping proc %s - already ranked, num_ranked %d", + ORTE_NAME_PRINT(&proc->name), num_ranked); + continue; + } + /* ignore procs from other apps */ + if (proc->app_idx != app->idx) { + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rank_by skipping proc %s - from another app, num_ranked %d", + ORTE_NAME_PRINT(&proc->name), num_ranked); + continue; + } + /* protect against bozo case */ + locale = NULL; + if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) { + ORTE_ERROR_LOG(ORTE_ERROR); + return ORTE_ERROR; + } + /* ignore procs not on this object */ + if (!hwloc_bitmap_intersects(obj->cpuset, locale->cpuset)) { + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rank_by: proc at position %d is not on object %d", + j, i); + continue; + } + /* assign the vpid */ + proc->name.vpid = vpid++; + if (0 == cnt) { + app->first_rank = proc->name.vpid; + } + cnt++; opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_by: proc at position %d is not on object %d", - j, i); - continue; + "mca:rmaps:rank_by: assigned rank %s", ORTE_VPID_PRINT(proc->name.vpid)); + /* insert the proc into the jdata array */ + if (NULL != (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->name.vpid))) { + OBJ_RELEASE(pptr); + } + OBJ_RETAIN(proc); + if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) { + ORTE_ERROR_LOG(rc); + OBJ_DESTRUCT(&objs); + return rc; + } + /* flag that one was mapped */ + all_done = false; + /* track where the highest vpid landed - this is our + * new bookmark + */ + jdata->bookmark = node; + /* move to next object */ + break; } - proc->name.vpid = vpid++; - if (0 == cnt) { - app->first_rank = proc->name.vpid; - } - cnt++; - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:rank_by: assigned rank %s", ORTE_VPID_PRINT(proc->name.vpid)); - /* insert the proc into the jdata array - no harm if already there */ - if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* flag that one was mapped */ - all_done = false; - /* track where the highest vpid landed - this is our - * new bookmark - */ - jdata->bookmark = node; - /* move to next object */ - break; } } } + /* cleanup */ + OBJ_DESTRUCT(&objs); } - - /* cleanup */ - OBJ_DESTRUCT(&objs); - return ORTE_SUCCESS; } -int orte_rmaps_base_compute_vpids(orte_job_t *jdata, - orte_app_context_t *app, - opal_list_t *nodes) +int orte_rmaps_base_compute_vpids(orte_job_t *jdata) { orte_job_map_t *map; + orte_app_context_t *app; orte_vpid_t vpid; - int j, cnt; + int j, m, n, cnt; orte_node_t *node; - orte_proc_t *proc; + orte_proc_t *proc, *pptr; int rc; - opal_list_item_t *item; bool one_found; map = jdata->map; @@ -445,7 +477,7 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata, opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: computing ranks by NUMA for job %s", ORTE_JOBID_PRINT(jdata->jobid)); - if (ORTE_SUCCESS != (rc = rank_by(jdata, app, nodes, HWLOC_OBJ_NODE, 0))) { + if (ORTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_NODE, 0))) { if (ORTE_ERR_NOT_SUPPORTED == rc && !(ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(map->ranking))) { ORTE_SET_RANKING_POLICY(map->ranking, ORTE_RANK_BY_SLOT); @@ -460,7 +492,7 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata, opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: computing ranks by socket for job %s", ORTE_JOBID_PRINT(jdata->jobid)); - if (ORTE_SUCCESS != (rc = rank_by(jdata, app, nodes, HWLOC_OBJ_SOCKET, 0))) { + if (ORTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_SOCKET, 0))) { if (ORTE_ERR_NOT_SUPPORTED == rc && !(ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(map->ranking))) { ORTE_SET_RANKING_POLICY(map->ranking, ORTE_RANK_BY_SLOT); @@ -475,7 +507,7 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata, opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: computing ranks by L3cache for job %s", ORTE_JOBID_PRINT(jdata->jobid)); - if (ORTE_SUCCESS != (rc = rank_by(jdata, app, nodes, HWLOC_OBJ_CACHE, 3))) { + if (ORTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_CACHE, 3))) { if (ORTE_ERR_NOT_SUPPORTED == rc && !(ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(map->ranking))) { ORTE_SET_RANKING_POLICY(map->ranking, ORTE_RANK_BY_SLOT); @@ -490,7 +522,7 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata, opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: computing ranks by L2cache for job %s", ORTE_JOBID_PRINT(jdata->jobid)); - if (ORTE_SUCCESS != (rc = rank_by(jdata, app, nodes, HWLOC_OBJ_CACHE, 2))) { + if (ORTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_CACHE, 2))) { if (ORTE_ERR_NOT_SUPPORTED == rc && !(ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(map->ranking))) { ORTE_SET_RANKING_POLICY(map->ranking, ORTE_RANK_BY_SLOT); @@ -505,7 +537,7 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata, opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: computing ranks by L1cache for job %s", ORTE_JOBID_PRINT(jdata->jobid)); - if (ORTE_SUCCESS != (rc = rank_by(jdata, app, nodes, HWLOC_OBJ_CACHE, 1))) { + if (ORTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_CACHE, 1))) { if (ORTE_ERR_NOT_SUPPORTED == rc && !(ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(map->ranking))) { ORTE_SET_RANKING_POLICY(map->ranking, ORTE_RANK_BY_SLOT); @@ -520,7 +552,7 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata, opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: computing ranks by core for job %s", ORTE_JOBID_PRINT(jdata->jobid)); - if (ORTE_SUCCESS != (rc = rank_by(jdata, app, nodes, HWLOC_OBJ_CORE, 0))) { + if (ORTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_CORE, 0))) { if (ORTE_ERR_NOT_SUPPORTED == rc && !(ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(map->ranking))) { ORTE_SET_RANKING_POLICY(map->ranking, ORTE_RANK_BY_SLOT); @@ -528,6 +560,7 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata, } ORTE_ERROR_LOG(rc); } + opal_output(0, "DONE"); return rc; } @@ -535,7 +568,7 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata, opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: computing ranks by hwthread for job %s", ORTE_JOBID_PRINT(jdata->jobid)); - if (ORTE_SUCCESS != (rc = rank_by(jdata, app, nodes, HWLOC_OBJ_PU, 0))) { + if (ORTE_SUCCESS != (rc = rank_by(jdata, HWLOC_OBJ_PU, 0))) { if (ORTE_ERR_NOT_SUPPORTED == rc && !(ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(map->ranking))) { ORTE_SET_RANKING_POLICY(map->ranking, ORTE_RANK_BY_SLOT); @@ -549,26 +582,83 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata, if (ORTE_RANK_BY_NODE == ORTE_GET_RANKING_POLICY(map->ranking) || ORTE_RANK_BY_BOARD == ORTE_GET_RANKING_POLICY(map->ranking)) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:base: computing vpids by node for job %s app %d on %d nodes", - ORTE_JOBID_PRINT(jdata->jobid), (int)app->idx, - (int)opal_list_get_size(nodes)); - /* bozo check */ - if (0 == opal_list_get_size(nodes)) { - ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); - return ORTE_ERR_BAD_PARAM; - } + "mca:rmaps:base: computing vpids by node for job %s", + ORTE_JOBID_PRINT(jdata->jobid)); /* assign the ranks round-robin across nodes - only one board/node * at this time, so they are equivalent */ - cnt=0; - vpid=jdata->num_procs; - one_found = true; - while (cnt < app->num_procs && one_found) { - one_found = false; - for (item = opal_list_get_first(nodes); - item != opal_list_get_end(nodes); - item = opal_list_get_next(item)) { - node = (orte_node_t*)item; + vpid=0; + for (n=0; n < jdata->apps->size; n++) { + if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, n))) { + continue; + } + cnt=0; + one_found = true; + while (cnt < app->num_procs && one_found) { + one_found = false; + for (m=0; m < jdata->map->nodes->size; m++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, m))) { + continue; + } + for (j=0; j < node->procs->size; j++) { + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { + continue; + } + /* ignore procs from other jobs */ + if (proc->name.jobid != jdata->jobid) { + continue; + } + /* ignore procs from other apps */ + if (proc->app_idx != app->idx) { + continue; + } + if (ORTE_VPID_INVALID != proc->name.vpid) { + continue; + } + proc->name.vpid = vpid++; + /* insert the proc into the jdata array */ + if (NULL != (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->name.vpid))) { + OBJ_RELEASE(pptr); + } + OBJ_RETAIN(proc); + if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) { + ORTE_ERROR_LOG(rc); + return rc; + } + cnt++; + one_found = true; + /* track where the highest vpid landed - this is our + * new bookmark + */ + jdata->bookmark = node; + break; /* move on to next node */ + } + } + } + if (cnt < app->num_procs) { + ORTE_ERROR_LOG(ORTE_ERR_FATAL); + return ORTE_ERR_FATAL; + } + } + return ORTE_SUCCESS; + } + + rankbyslot: + if (ORTE_RANK_BY_SLOT == ORTE_GET_RANKING_POLICY(map->ranking)) { + /* assign the ranks sequentially */ + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:base: computing vpids by slot for job %s", + ORTE_JOBID_PRINT(jdata->jobid)); + vpid = 0; + for (n=0; n < jdata->apps->size; n++) { + if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, n))) { + continue; + } + for (m=0; m < jdata->map->nodes->size; m++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, m))) { + continue; + } + for (j=0; j < node->procs->size; j++) { if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { continue; @@ -581,70 +671,25 @@ int orte_rmaps_base_compute_vpids(orte_job_t *jdata, if (proc->app_idx != app->idx) { continue; } - if (ORTE_VPID_INVALID != proc->name.vpid) { - continue; + if (ORTE_VPID_INVALID == proc->name.vpid) { + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:base: assigning rank %s to node %s", + ORTE_VPID_PRINT(vpid), node->name); + proc->name.vpid = vpid++; + /* track where the highest vpid landed - this is our + * new bookmark + */ + jdata->bookmark = node; } - proc->name.vpid = vpid++; - /* insert the proc into the jdata array - no harm if already there */ + /* insert the proc into the jdata array */ + if (NULL != (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->name.vpid))) { + OBJ_RELEASE(pptr); + } + OBJ_RETAIN(proc); if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) { ORTE_ERROR_LOG(rc); return rc; } - cnt++; - one_found = true; - /* track where the highest vpid landed - this is our - * new bookmark - */ - jdata->bookmark = node; - break; /* move on to next node */ - } - } - } - if (cnt < app->num_procs) { - ORTE_ERROR_LOG(ORTE_ERR_FATAL); - return ORTE_ERR_FATAL; - } - return ORTE_SUCCESS; - } - - rankbyslot: - if (ORTE_RANK_BY_SLOT == ORTE_GET_RANKING_POLICY(map->ranking)) { - /* assign the ranks sequentially */ - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:base: computing vpids by slot for job %s", - ORTE_JOBID_PRINT(jdata->jobid)); - vpid = jdata->num_procs; - for (item = opal_list_get_first(nodes); - item != opal_list_get_end(nodes); - item = opal_list_get_next(item)) { - node = (orte_node_t*)item; - - for (j=0; j < node->procs->size; j++) { - if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { - continue; - } - /* ignore procs from other jobs */ - if (proc->name.jobid != jdata->jobid) { - continue; - } - /* ignore procs from other apps */ - if (proc->app_idx != app->idx) { - continue; - } - if (ORTE_VPID_INVALID == proc->name.vpid) { - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:base: assigning rank %s to node %s", - ORTE_VPID_PRINT(vpid), node->name); - proc->name.vpid = vpid++; - /* track where the highest vpid landed - this is our - * new bookmark - */ - jdata->bookmark = node; - } - /* insert the proc into the jdata array - no harm if already there */ - if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) { - ORTE_ERROR_LOG(rc); - return rc; } } } diff --git a/orte/mca/rmaps/base/rmaps_base_support_fns.c b/orte/mca/rmaps/base/rmaps_base_support_fns.c index b9003c93f59..cf8b9b71f69 100644 --- a/orte/mca/rmaps/base/rmaps_base_support_fns.c +++ b/orte/mca/rmaps/base/rmaps_base_support_fns.c @@ -351,6 +351,7 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr /* the list is empty - if the HNP is allocated, then add it */ if (orte_hnp_is_allocated) { nd = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0); + OBJ_RETAIN(nd); opal_list_append(allocated_nodes, &nd->super); } else { nd = NULL; @@ -476,8 +477,7 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr /* if the hnp was not allocated, or flagged not to be used, * then remove it here */ if (!orte_hnp_is_allocated || (ORTE_GET_MAPPING_DIRECTIVE(policy) & ORTE_MAPPING_NO_USE_LOCAL)) { - node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0); - if (node == (orte_node_t*)item) { + if (0 == node->index) { opal_list_remove_item(allocated_nodes, item); OBJ_RELEASE(item); /* "un-retain" it */ item = next; @@ -508,24 +508,24 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr continue; } if (node->slots > node->slots_inuse) { - /* add the available slots */ - OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output, - "%s node %s has %d slots available", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - node->name, node->slots - node->slots_inuse)); - num_slots += node->slots - node->slots_inuse; - item = next; - continue; + /* add the available slots */ + OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output, + "%s node %s has %d slots available", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + node->name, node->slots - node->slots_inuse)); + num_slots += node->slots - node->slots_inuse; + item = next; + continue; } if (!(ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(policy))) { - /* nothing needed to do here - we don't add slots to the - * count as we don't have any available. Just let the mapper - * do what it needs to do to meet the request - */ - OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output, - "%s node %s is fully used, but available for oversubscription", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - node->name)); + /* nothing needed to do here - we don't add slots to the + * count as we don't have any available. Just let the mapper + * do what it needs to do to meet the request + */ + OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output, + "%s node %s is fully used, but available for oversubscription", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + node->name)); } else { /* if we cannot use it, remove it from list */ opal_list_remove_item(allocated_nodes, item); diff --git a/orte/mca/rmaps/base/rmaps_private.h b/orte/mca/rmaps/base/rmaps_private.h index 8950a1b76df..d9e7f9dcfe0 100644 --- a/orte/mca/rmaps/base/rmaps_private.h +++ b/orte/mca/rmaps/base/rmaps_private.h @@ -12,6 +12,7 @@ * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011-2012 Los Alamos National Security, LLC. * All rights reserved. + * Copyright (c) 2017 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -55,9 +56,7 @@ ORTE_DECLSPEC orte_proc_t* orte_rmaps_base_setup_proc(orte_job_t *jdata, ORTE_DECLSPEC orte_node_t* orte_rmaps_base_get_starting_point(opal_list_t *node_list, orte_job_t *jdata); -ORTE_DECLSPEC int orte_rmaps_base_compute_vpids(orte_job_t *jdata, - orte_app_context_t *app, - opal_list_t *nodes); +ORTE_DECLSPEC int orte_rmaps_base_compute_vpids(orte_job_t *jdata); ORTE_DECLSPEC int orte_rmaps_base_compute_local_ranks(orte_job_t *jdata); diff --git a/orte/mca/rmaps/lama/.opal_ignore b/orte/mca/rmaps/lama/.opal_ignore deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/orte/mca/rmaps/lama/Makefile.am b/orte/mca/rmaps/lama/Makefile.am deleted file mode 100644 index 0512f8b10da..00000000000 --- a/orte/mca/rmaps/lama/Makefile.am +++ /dev/null @@ -1,40 +0,0 @@ -# -# Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. -# -# Copyright (c) 2012 Cisco Systems, Inc. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -dist_ortedata_DATA = help-orte-rmaps-lama.txt - -sources = \ - rmaps_lama_module.c \ - rmaps_lama_max_tree.c \ - rmaps_lama_params.c \ - rmaps_lama.h \ - rmaps_lama_component.c - -# Make the output library in this directory, and name it either -# mca__.la (for DSO builds) or libmca__.la -# (for static builds). - -if MCA_BUILD_orte_rmaps_lama_DSO -component_noinst = -component_install = mca_rmaps_lama.la -else -component_noinst = libmca_rmaps_lama.la -component_install = -endif - -mcacomponentdir = $(ortelibdir) -mcacomponent_LTLIBRARIES = $(component_install) -mca_rmaps_lama_la_SOURCES = $(sources) -mca_rmaps_lama_la_LDFLAGS = -module -avoid-version - -noinst_LTLIBRARIES = $(component_noinst) -libmca_rmaps_lama_la_SOURCES =$(sources) -libmca_rmaps_lama_la_LDFLAGS = -module -avoid-version diff --git a/orte/mca/rmaps/lama/help-orte-rmaps-lama.txt b/orte/mca/rmaps/lama/help-orte-rmaps-lama.txt deleted file mode 100644 index f1b7239bb4f..00000000000 --- a/orte/mca/rmaps/lama/help-orte-rmaps-lama.txt +++ /dev/null @@ -1,173 +0,0 @@ -# -*- text -*- -# -# Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. -# Copyright (c) 2013 Cisco Systems, Inc. All rights reserved. -# -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# -# This is the US/English general help file for LAMA Mapper. -# -[orte-rmaps-lama:multi-apps-and-zero-np] -RMAPS found multiple applications to be launched, with at least one that failed -to specify the number of processes to execute. When specifying multiple -applications, you must specify how many processes of each to launch via the --np argument. -# -[orte-rmaps-lama:oversubscribe] -RMaps LAMA detected oversubscription after mapping %d of %d processes. -Since you have asked not to oversubscribe the resources the job will not -be launched. If you would instead like to oversubscribe the resources -try using the --oversubscribe option to mpirun. -# -[orte-rmaps-lama:no-resources-available] -RMaps LAMA detected that there are not enough resources to map the -remainder of the job. Check the command line options, and the number of -nodes allocated to this job. - Application Context : %d - # of Processes Successfully Mapped: %d - # of Processes Requested : %d - Mapping : %s - Binding : %s - MPPR : %s - Ordering : %s -# -[orte-rmaps-lama:merge-conflict-bad-prune-src] -RMaps LAMA detected that it needed to prune a level of the hierarchy that -was necessary for one of the command line parameters. Check your allocation -and the options below to make sure they are correct. - Conflicting Level Description: %s - Mapping : %s - Binding : %s - MPPR : %s - Ordering : %s -# -[invalid mapping option] -The specified mapping option is not supported with the LAMA rmaps -mapper: - - Specified mapping option: %s - Reason it is invalid: %s - -LAMA supports the following options to the mpirun --map-by option: - - node, numa, socket, l1cache, l2cache, l3cache, core, hwthread, slot - -Alternatively, LAMA supports specifying a sequence of letters in the -rmaps_lama_map MCA parameter; each letter indicates a "direction" for -mapping. The rmaps_lama_map MCA parameter is richer/more flexible -than the --may-by CLI option. If rmaps_lama_map is specified, the -following letters must be specified: - - h: hardware thread - c: processor core - s: processor socket - n: node (server) - -The following may also optionally be included in the mapping string: - - N: NUMA node - L1: L1 cache - L2: L2 cache - L3: L3 cache - -For example, the two commands below are equivalent: - - mpirun --mca rmaps lama --mca rmaps_lama_map csNh ... - mpirun --mca rmaps lama --map-by core ... -# -[invalid binding option] -The specified binding option is not supported with the LAMA rmaps -mapper: - - Specified binding option: %s - Reason it is invalid: %s - -LAMA binding options can be specified via the mpirun --bind-to command -line option or rmaps_lama_bind MCA param: - - --bind-to rmaps_lama_binding - Locality option option - ---------------- --------- ------------------ - Hardware thread hwthread h - Processor core core c - Processor socket socket s - NUMA node numa N - L1 cache l1cache L1 - L2 cache l2cache L2 - L3 cache l3cache L3 - Node (server) node n - -The --bind-to option assumes a single locality (e.g., bind each MPI -process to a single core, socket, etc.). The rmaps_lama_bind MCA -param requires an integer specifying how many localities to which to -bind. For example, the following two command lines are equivalent, -and bind each MPI process to a single core: - - mpirun --btl rmaps lama --mca rmaps_lama_bind 1c ... - mpirun --btl rmaps lama --bind-to core ... - -The rmaps_lama_bind MCA parameter is more flexible than the --bind-to -CLI option, because it allows binding to multiple resources. For -example, specifing an rmaps_lama_bind value of "2c" binds each MPI -process to two cores. -# -[invalid ordering option] -The specified ordering option is not supported. - - Specified ordering option: %s - -The LAMA ordering can be specified via the rmaps_lama_ordering MCA -parameter. - -Two options are supported for ordering ranks in MPI_COMM_WORLD (MCW): - - s: Sequential. MCW rank ordering is sequential by hardware thread - across all nodes. E.g., MCW rank 0 is the first process on node - 0; MCW rank 1 is the second process on node 0, and so on. - n: Natural. MCW rank ordering follows the "natural" mapping layout. - For example, in a by-socket layout, MCW rank 0 is the first - process on the 1st socket on node 0. MCW rank 1 is then the - first process on the 2nd socket on node 0. And so on. -# -[invalid mppr option] -The specified Max Processes Per Resource (MPPR) value is invalid (in -the rmaps_lama_mppr MCA paramter): - - Specified MPPR: %s - Reason is is invalid: %s - -The MPPR is a comma-delimited list of specifications indicating how -many processes are allowed on a given type of resource before an MPI -job is considered to have oversubscribed that resource. Each -specification is a token in the format of "NUMBER:RESOURCE". For -example, the default MPPR of "1:c" means that Open MPI will map one -process per processor core before considering cores to be -oversubscribed. - -Multiple specifications may be useful; for example "1:c,2:s" maintains -the default one-process-per-core limitation, but places an additional -limitation of only two processes per processor socket (assuming that -there are more than two cores per socket). - -The LAMA MPPR specifications are set via the rmaps_lama_mppr MCA -parameter. The following resources can be specified: - - Hardware thread h - Processor core c - Processor socket s - NUMA node N - L1 cache L1 - L2 cache L2 - L3 cache L3 - Node (server) n -# -[internal error] -An unexpected internal error occurred in the LAMA mapper; your job -will now fail. Sorry. - - File: %s - Message: %s diff --git a/orte/mca/rmaps/lama/owner.txt b/orte/mca/rmaps/lama/owner.txt deleted file mode 100644 index 0cc0384f0eb..00000000000 --- a/orte/mca/rmaps/lama/owner.txt +++ /dev/null @@ -1,7 +0,0 @@ -# -# owner/status file -# owner: institution that is responsible for this package -# status: e.g. active, maintenance, unmaintained -# -owner: CISCO -status: maintenance diff --git a/orte/mca/rmaps/lama/rmaps_lama.h b/orte/mca/rmaps/lama/rmaps_lama.h deleted file mode 100644 index 8cb830f861e..00000000000 --- a/orte/mca/rmaps/lama/rmaps_lama.h +++ /dev/null @@ -1,177 +0,0 @@ -/* - * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. - * - * Copyright (c) 2013-2017 Cisco Systems, Inc. All rights reserved - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** - * @file - * - * Resource Mapping - */ -#ifndef ORTE_RMAPS_LAMA_H -#define ORTE_RMAPS_LAMA_H - -#include "orte_config.h" - -#include "opal/class/opal_tree.h" - -#include "orte/mca/rmaps/rmaps.h" - -BEGIN_C_DECLS - -ORTE_MODULE_DECLSPEC extern orte_rmaps_base_component_t mca_rmaps_lama_component; - -extern orte_rmaps_base_module_t orte_rmaps_lama_module; - - -/********************************* - * Structures & Defines - *********************************/ -/* - * JJH: Can we reuse the opal_hwloc_level_t data structure in - * opal/mca/hwloc/hwloc-internal.h - */ -typedef enum { - LAMA_LEVEL_MACHINE = 0, - LAMA_LEVEL_BOARD = 1, - LAMA_LEVEL_NUMA = 2, - LAMA_LEVEL_SOCKET = 3, - LAMA_LEVEL_CACHE_L3 = 4, - LAMA_LEVEL_CACHE_L2 = 5, - LAMA_LEVEL_CACHE_L1 = 6, - LAMA_LEVEL_CORE = 7, - LAMA_LEVEL_PU = 8, - LAMA_LEVEL_UNKNOWN = 9 -} rmaps_lama_level_type_t; - -typedef enum { - LAMA_ORDER_NATURAL = 0, - LAMA_ORDER_SEQ = 1 -} rmaps_lama_order_type_t; - -struct rmaps_lama_level_info_t { - rmaps_lama_level_type_t type; - int max_resources; -}; -typedef struct rmaps_lama_level_info_t rmaps_lama_level_info_t; - -/* - * Structure to attach to the hwloc tree - * Accounting for mppr - */ -struct rmaps_lama_hwloc_user_t { - opal_object_t super; - - opal_pointer_array_t *node_mppr; -}; -typedef struct rmaps_lama_hwloc_user_t rmaps_lama_hwloc_user_t; -OBJ_CLASS_DECLARATION(rmaps_lama_hwloc_user_t); - -struct rmaps_lama_node_mppr_t { - int max; - int cur; -}; -typedef struct rmaps_lama_node_mppr_t rmaps_lama_node_mppr_t; - -rmaps_lama_level_type_t lama_type_str_to_enum(char *param); -char * lama_type_enum_to_str(rmaps_lama_level_type_t param); - - -/********************************* - * Command Line Interface Parsing - *********************************/ -/* - * User defined command line interface (CLI) arguments - */ -extern char * rmaps_lama_cmd_map; -extern char * rmaps_lama_cmd_bind; -extern char * rmaps_lama_cmd_mppr; -extern char * rmaps_lama_cmd_ordering; -extern bool rmaps_lama_timing_enabled; -extern bool rmaps_lama_can_oversubscribe; -extern bool rmaps_lama_am_oversubscribing; - -/* - * Internal representations of command line arguments - */ -extern int lama_mapping_num_layouts; -extern rmaps_lama_level_type_t *lama_mapping_layout; - -extern rmaps_lama_level_type_t lama_binding_level; - -extern rmaps_lama_level_info_t *lama_mppr_levels; -extern int lama_mppr_num_levels; - -/* - * Homogeneous system optimization - */ -extern bool lama_mppr_max_tree_homogeneous_system; - -/* - * Maximum length of digits in CLI - */ -#define MAX_BIND_DIGIT_LEN 4 - -int rmaps_lama_process_alias_params(orte_job_t *jdata); - -int rmaps_lama_parse_mapping(char *layout, - rmaps_lama_level_type_t **layout_types, - rmaps_lama_level_type_t **layout_types_sorted, - int *num_types); -int rmaps_lama_parse_binding(char *layout, - rmaps_lama_level_type_t *binding_level, - int *num_types); -int rmaps_lama_parse_mppr(char *layout, - rmaps_lama_level_info_t **mppr_levels, - int *num_types); -int rmaps_lama_parse_ordering(char *layout, - rmaps_lama_order_type_t *order); - -bool rmaps_lama_ok_to_prune_level(rmaps_lama_level_type_t level); - -/********************************* - * Max Tree Structure - *********************************/ -struct rmaps_lama_max_tree_item_t { - opal_tree_item_t tree_element; - - rmaps_lama_level_type_t type; -}; -typedef struct rmaps_lama_max_tree_item_t rmaps_lama_max_tree_item_t; - - -/* - * Union all topologies into the max tree - */ -int rmaps_lama_build_max_tree(orte_job_t *jdata, opal_list_t *node_list, - opal_tree_t * max_tree, bool *is_homogeneous); - -/* - * Find a matching subtree - */ -hwloc_obj_t * rmaps_lama_find_nth_subtree_match(hwloc_topology_t hwloc_topo, - hwloc_obj_t parent_obj, - int nth, - rmaps_lama_level_type_t lama_key); -hwloc_obj_t * rmaps_lama_find_parent(hwloc_topology_t hwloc_topo, - hwloc_obj_t *child_obj, - rmaps_lama_level_type_t lama_key); - -/* - * Create Empty Tree - */ -opal_tree_t * rmaps_lama_create_empty_max_tree(void); - -/* - * Pretty Print - */ -void rmaps_lama_max_tree_pretty_print_tree(opal_tree_t *tree); - -END_C_DECLS - -#endif /* ORTE_RMAPS_LAMA_H */ diff --git a/orte/mca/rmaps/lama/rmaps_lama_component.c b/orte/mca/rmaps/lama/rmaps_lama_component.c deleted file mode 100644 index e8734dbec64..00000000000 --- a/orte/mca/rmaps/lama/rmaps_lama_component.c +++ /dev/null @@ -1,136 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. - * - * Copyright (c) 2012 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2015 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "orte_config.h" -#include "orte/constants.h" - -#include "opal/mca/base/base.h" - -#include "orte/mca/rmaps/base/rmaps_private.h" -#include "orte/mca/rmaps/base/base.h" - -#include "rmaps_lama.h" - -/* - * Local functions - */ - -static int orte_rmaps_lama_register(void); -static int orte_rmaps_lama_query(mca_base_module_t **module, int *priority); - -static int module_priority; - -char * rmaps_lama_cmd_map = NULL; -char * rmaps_lama_cmd_bind = NULL; -char * rmaps_lama_cmd_mppr = NULL; -char * rmaps_lama_cmd_ordering = NULL; -bool rmaps_lama_timing_enabled = false; -bool rmaps_lama_can_oversubscribe = false; -bool rmaps_lama_am_oversubscribing = false; - -orte_rmaps_base_component_t mca_rmaps_lama_component = { - .base_version = { - ORTE_RMAPS_BASE_VERSION_2_0_0, - - .mca_component_name = "lama", - MCA_BASE_MAKE_VERSION(component, ORTE_MAJOR_VERSION, ORTE_MINOR_VERSION, - ORTE_RELEASE_VERSION), - .mca_query_component = orte_rmaps_lama_query, - .mca_register_component_params = orte_rmaps_lama_register, - }, - .base_data = { - /* The component is checkpoint ready */ - MCA_BASE_METADATA_PARAM_CHECKPOINT - }, -}; - - -static int orte_rmaps_lama_register(void) -{ - mca_base_component_t *c = &mca_rmaps_lama_component.base_version; - - /* JMS Artifically low for now */ - module_priority = 0; - (void) mca_base_component_var_register (c, "priority", "Priority of the LAMA rmaps component", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &module_priority); - - rmaps_lama_timing_enabled = false; - (void) mca_base_component_var_register (c, "timing", - "Enable timing information. [Default = disabled]", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &rmaps_lama_timing_enabled); - - rmaps_lama_cmd_map = NULL; - (void) mca_base_component_var_register (c, "map", "LAMA Map: Process layout iteration ordering (See documentation)", - MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, - &rmaps_lama_cmd_map); - - rmaps_lama_cmd_bind = NULL; - (void) mca_base_component_var_register (c, "bind", "LAMA Bind: Bind to the specified number of resources (See documentation)", - MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, - &rmaps_lama_cmd_bind); - - rmaps_lama_cmd_mppr = NULL; - (void) mca_base_component_var_register (c, "mppr", "LAMA MPPR: Maximum number of the specified resources available (See documentation)", - MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, - &rmaps_lama_cmd_mppr); - - rmaps_lama_cmd_ordering = NULL; - (void) mca_base_component_var_register (c, "ordering", "LAMA Ordering: Ordering (s) sequential, (n) natural - Default: n (See documentation)", - MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, - OPAL_INFO_LVL_5, - MCA_BASE_VAR_SCOPE_READONLY, - &rmaps_lama_cmd_ordering); - - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Priority %3d", - module_priority); - - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Map : %s", - (NULL == rmaps_lama_cmd_map) ? "NULL" : rmaps_lama_cmd_map); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Bind : %s", - (NULL == rmaps_lama_cmd_bind) ? "NULL" : rmaps_lama_cmd_bind); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: MPPR : %s", - (NULL == rmaps_lama_cmd_mppr) ? "NULL" : rmaps_lama_cmd_mppr); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Order : %s", - (NULL == rmaps_lama_cmd_ordering) ? "NULL" : rmaps_lama_cmd_ordering); - - return ORTE_SUCCESS; -} - - -static int orte_rmaps_lama_query(mca_base_module_t **module, int *priority) -{ - /* Only run on the HNP */ - - *priority = module_priority; - *module = (mca_base_module_t *)&orte_rmaps_lama_module; - - return ORTE_SUCCESS; -} diff --git a/orte/mca/rmaps/lama/rmaps_lama_max_tree.c b/orte/mca/rmaps/lama/rmaps_lama_max_tree.c deleted file mode 100644 index a1183028b3b..00000000000 --- a/orte/mca/rmaps/lama/rmaps_lama_max_tree.c +++ /dev/null @@ -1,1182 +0,0 @@ -/* - * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2012 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2015 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** - * Max Tree Support Functions - * - */ -#include "rmaps_lama.h" - -#include "orte/util/show_help.h" - -#include "orte/mca/errmgr/errmgr.h" - -#include "orte/mca/rmaps/base/rmaps_private.h" -#include "orte/mca/rmaps/base/base.h" - -/********************************* - * Max Tree Construction - *********************************/ -/* - * Convert an hwloc tree to an opal_tree - */ -static int rmaps_lama_convert_hwloc_tree_to_opal_tree(opal_tree_t *opal_tree, - hwloc_topology_t *hwloc_topo); - -/* - * Convert an hwloc subtree to an opal subtree - */ -static int rmaps_lama_convert_hwloc_subtree(hwloc_obj_t obj, - opal_tree_item_t *parent_item); - -/* - * Convert LAMA key to HWLOC key/depth - */ -static int rmaps_lama_convert_lama_key_to_hwloc_key(rmaps_lama_level_type_t lama_key, - hwloc_obj_type_t *hwloc_key, int *depth); - -/* - * Convert HWLOC key/depth to LAMA key - */ -static int rmaps_lama_convert_hwloc_key_to_lama_key(hwloc_obj_type_t hwloc_key, int depth, - rmaps_lama_level_type_t *lama_key); - -/* - * Compare two HWLOC topologies for similar structure - */ -static int rmaps_lama_hwloc_compare_topos(hwloc_topology_t *left, hwloc_topology_t *right); -static int rmaps_lama_hwloc_compare_subtrees(hwloc_obj_t left, hwloc_obj_t right); - -/* - * Merge two opal_trees - */ -static int rmaps_lama_merge_trees(opal_tree_t *src_tree, opal_tree_t *into_tree, - opal_tree_item_t *src_parent, opal_tree_item_t *into_parent); - -/* - * Prune the max tree to just those levels specified - */ -static int rmaps_lama_prune_max_tree(opal_tree_t *max_tree, opal_tree_item_t *parent_item); - -/* - * Annotate the hwloc tree for MPPR accounting - */ -static int rmaps_lama_annotate_node_for_mppr(orte_node_t *node, hwloc_obj_t obj); - -/* - * Access the MPPR for the specified key - */ -static int rmaps_lama_get_mppr_for_key(orte_node_t *node, rmaps_lama_level_type_t lama_key); - -/* - * Recursive core of nth_subtree_match - */ -static int rmaps_lama_find_nth_subtree_match_core(hwloc_topology_t hwloc_topo, - hwloc_obj_t parent_obj, - int nth, - int *num_found, - hwloc_obj_type_t hwloc_key, - int depth, - hwloc_obj_t *cur_child); - -static void rmaps_lama_max_tree_item_construct(rmaps_lama_max_tree_item_t *item) -{ - item->type = LAMA_LEVEL_UNKNOWN; -} - - -/********************************* - * Max Tree Accessors/Functions - *********************************/ -OBJ_CLASS_INSTANCE(rmaps_lama_max_tree_item_t, - opal_tree_item_t, - rmaps_lama_max_tree_item_construct, NULL); - -static int lama_max_tree_comp(opal_tree_item_t *item, void *key); -static int lama_max_tree_serialize(opal_tree_item_t *item, opal_buffer_t *buffer); -static int lama_max_tree_deserialize(opal_buffer_t *buffer, opal_tree_item_t **item); -static void * lama_max_tree_get_key(opal_tree_item_t *item); - - -/********************************* - * Max Tree Pretty Print - *********************************/ -static char * rmaps_lama_max_tree_pretty_print_subtree_element_get(opal_tree_t *tree, - opal_tree_item_t *parent, - int level); -static void pretty_print_subtree(opal_tree_t *tree, opal_tree_item_t *parent, int level); -static void pretty_print_subtree_element(opal_tree_t *tree, opal_tree_item_t *parent, int level); - - -/********************************* - * Function Defintions - *********************************/ -int rmaps_lama_build_max_tree(orte_job_t *jdata, opal_list_t *node_list, - opal_tree_t * max_tree, bool *is_homogeneous) -{ - int ret; - opal_tree_t *tmp_tree = NULL; - hwloc_topology_t topo, *last_topo = NULL; - orte_node_t *cur_node = NULL; - - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ---------------------------------"); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ----- Building the Max Tree..."); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ---------------------------------"); - - /* - * Assume homogeneous system, unless otherwise noted - */ - *is_homogeneous = true; - - /* - * Process all other unique trees from remote daemons who are in - * this allocation - */ - for(cur_node = (orte_node_t*)opal_list_get_first(node_list); - cur_node != (orte_node_t*)opal_list_get_end(node_list); - cur_node = (orte_node_t*)opal_list_get_next(cur_node) ) { - if (NULL == (topo = cur_node->topology)) { - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ----- No Tree Available: %s (skipping)", cur_node->name); - } - - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ----- Converting Remote Tree: %s", cur_node->name); - - /* - * Convert to opal_tree - */ - tmp_tree = rmaps_lama_create_empty_max_tree(); - rmaps_lama_convert_hwloc_tree_to_opal_tree(tmp_tree, &topo); - if( 11 <= opal_output_get_verbosity(orte_rmaps_base_framework.framework_output) ) { - rmaps_lama_max_tree_pretty_print_tree(tmp_tree); - } - - /* - * Compare the current and last topologies if we are still considering - * this max tree to represent a homogeneous system. - */ - if( *is_homogeneous ) { - if( NULL == last_topo ) { - last_topo = &topo; - } else { - if( 0 != rmaps_lama_hwloc_compare_topos(last_topo, &topo) ) { - *is_homogeneous = false; - } - } - } - - /* - * Prune the input tree so that is only contains levels that the user - * asked for. - */ - if( 11 <= opal_output_get_verbosity(orte_rmaps_base_framework.framework_output) ) { - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ---------------------------------"); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ----- Pruning input Tree..."); - } - if( ORTE_SUCCESS != (ret = rmaps_lama_prune_max_tree(tmp_tree, opal_tree_get_root(tmp_tree))) ) { - return ret; - } - if( 11 <= opal_output_get_verbosity(orte_rmaps_base_framework.framework_output) ) { - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ----- Input Tree... - Post Prune"); - rmaps_lama_max_tree_pretty_print_tree(tmp_tree); - } - - /* - * Merge into max_tree - */ - if( opal_tree_is_empty(max_tree) ) { - opal_tree_dup(tmp_tree, max_tree); - } else { - if( ORTE_SUCCESS != (ret = rmaps_lama_merge_trees(tmp_tree, - max_tree, - opal_tree_get_root(tmp_tree), - opal_tree_get_root(max_tree) ))) { - return ret; - } - } - - /* - * Release and move on... - */ - OBJ_RELEASE(tmp_tree); - tmp_tree = NULL; - } - - - /* - * Fill out the MPPR accounting information for each node - */ - for(cur_node = (orte_node_t*)opal_list_get_first(node_list); - cur_node != (orte_node_t*)opal_list_get_end(node_list); - cur_node = (orte_node_t*)opal_list_get_next(cur_node) ) { - if( ORTE_SUCCESS != (ret = rmaps_lama_annotate_node_for_mppr(cur_node, - hwloc_get_obj_by_depth(cur_node->topology, 0, 0))) ) { - ORTE_ERROR_LOG(ret); - return ret; - } - } - - /* - * JJH: NEEDS TESTING - * Note: This check is in place, but not used at the moment due to lack of - * system availability. Pending system availability and further testing, - * just assume heterogeneous. - */ - *is_homogeneous = false; - - /* - * Display the final Max Tree - */ - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ----- Final Max Tree... - %s system", - (*is_homogeneous ? "Homogeneous" : "Heterogeneous") ); - if( 11 <= opal_output_get_verbosity(orte_rmaps_base_framework.framework_output) ) { - rmaps_lama_max_tree_pretty_print_tree(max_tree); - } - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ---------------------------------"); - - return ORTE_SUCCESS; -} - -static int rmaps_lama_convert_hwloc_tree_to_opal_tree(opal_tree_t *opal_tree, hwloc_topology_t *hwloc_topo) -{ - hwloc_obj_t topo_root; - - if( 15 <= opal_output_get_verbosity(orte_rmaps_base_framework.framework_output) ) { - opal_output_verbose(15, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ----- Converting Topology:"); - /* opal_dss.dump(0, opal_hwloc_topology, OPAL_HWLOC_TOPO); */ - opal_dss.dump(0, *hwloc_topo, OPAL_HWLOC_TOPO); - } - - topo_root = hwloc_get_root_obj(*hwloc_topo); - - rmaps_lama_convert_hwloc_subtree(topo_root, - opal_tree_get_root(opal_tree)); - - return ORTE_SUCCESS; -} - -static int rmaps_lama_convert_hwloc_subtree(hwloc_obj_t obj, - opal_tree_item_t *parent_item) -{ - rmaps_lama_max_tree_item_t *max_tree_item = NULL; - char * key_child_str = NULL; - char * key_parent_str = NULL; - - while (obj) { - /* - * Create new tree item - */ - max_tree_item = OBJ_NEW(rmaps_lama_max_tree_item_t); - - /* - * Convert the HWLOC object to the LAMA key - */ - rmaps_lama_convert_hwloc_key_to_lama_key(obj->type, - obj->attr->cache.depth, - &(max_tree_item->type)); - - /* - * Append tree item to parent. Unless it is the same as the - * parent (L1 instruction vs data cache). JJH: Newer versions - * of hwloc can differentiate from the obj->attr->cache.type. - */ - if( NULL != obj->parent && - obj->parent->type == obj->type && - obj->parent->attr->cache.depth == obj->attr->cache.depth ) { - key_child_str = lama_type_enum_to_str(max_tree_item->type); - key_parent_str = lama_type_enum_to_str(((rmaps_lama_max_tree_item_t*)parent_item)->type); - opal_output_verbose(10, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Warning: Identical level detected: " - "Child [%s] vs Parent [%s]", - key_child_str, key_parent_str); - free(key_child_str); - free(key_parent_str); - - /* - * Add descendants if they exist - */ - if (obj->first_child) { - rmaps_lama_convert_hwloc_subtree(obj->first_child, - parent_item); - } - } else { - opal_tree_add_child(parent_item, &max_tree_item->tree_element); - - /* - * Add descendants if they exist - */ - if (obj->first_child) { - rmaps_lama_convert_hwloc_subtree(obj->first_child, - &max_tree_item->tree_element); - } - } - - /* - * Advance to next sibling - */ - obj = obj->next_sibling; - } - - return ORTE_SUCCESS; -} - -static int rmaps_lama_annotate_node_for_mppr(orte_node_t *node, hwloc_obj_t obj) -{ - rmaps_lama_hwloc_user_t *hwloc_userdata = NULL; - rmaps_lama_node_mppr_t *mppr_accounting = NULL; - rmaps_lama_level_type_t lama_key; - opal_hwloc_topo_data_t *opal_hwloc_topo = NULL; - int i; - - /* - * Attach our user pointer to the topology, if it is not already there. - * We will fill it in as needed later. - * - * Note: opal/mca/hwloc/base/hwloc_base_util.c attaches their own object - * to the userdata. There is a pointer in that structure we can use without - * interfering with what OPAL is trying to do. - */ - if( NULL == obj->userdata ) { - /* Some objects may not have topo data associated with them - * JJH: This is memory leak :/ Fix. - */ - obj->userdata = (void*)OBJ_NEW(opal_hwloc_topo_data_t); - } - if( NULL != obj->userdata ) { - opal_hwloc_topo = (opal_hwloc_topo_data_t*)(obj->userdata); - - if( NULL == opal_hwloc_topo->userdata ) { - hwloc_userdata = OBJ_NEW(rmaps_lama_hwloc_user_t); - opal_hwloc_topo->userdata = hwloc_userdata; - } else { - hwloc_userdata = (rmaps_lama_hwloc_user_t*)(opal_hwloc_topo->userdata); - } - } - - - /* - * Add node information if it is not already there - */ - mppr_accounting = (rmaps_lama_node_mppr_t*)opal_pointer_array_get_item(hwloc_userdata->node_mppr, node->index); - if( NULL == mppr_accounting ) { - /* - * Add MPPR accounting for this node associated with this object - */ - rmaps_lama_convert_hwloc_key_to_lama_key(obj->type, obj->attr->cache.depth, &lama_key); - - mppr_accounting = (rmaps_lama_node_mppr_t*)malloc(sizeof(rmaps_lama_node_mppr_t)); - mppr_accounting->max = rmaps_lama_get_mppr_for_key(node, lama_key); - mppr_accounting->cur = 0; - - opal_pointer_array_set_item(hwloc_userdata->node_mppr, node->index, mppr_accounting); - } - - - /* - * Decend tree - */ - for(i = 0; i < (int)obj->arity; ++i ) { - rmaps_lama_annotate_node_for_mppr(node, - obj->children[i]); - } - - return ORTE_SUCCESS; -} - -static int rmaps_lama_get_mppr_for_key(orte_node_t *node, rmaps_lama_level_type_t lama_key) -{ - int i; - - for( i = 0; i < lama_mppr_num_levels; ++i ) { - if( lama_key == lama_mppr_levels[i].type ) { - return lama_mppr_levels[i].max_resources; - } - } - - return -1; -} - -static int rmaps_lama_convert_lama_key_to_hwloc_key(rmaps_lama_level_type_t lama_key, hwloc_obj_type_t *hwloc_key, int *depth) -{ - *depth = 0; - - switch(lama_key) { - case LAMA_LEVEL_MACHINE: - *hwloc_key = HWLOC_OBJ_MACHINE; - break; - /* Note: HWLOC does not support boards */ -#if 0 - case LAMA_LEVEL_BOARD: - *hwloc_key = HWLOC_OBJ_MACHINE; - break; -#endif - case LAMA_LEVEL_SOCKET: - *hwloc_key = HWLOC_OBJ_SOCKET; - break; - case LAMA_LEVEL_CORE: - *hwloc_key = HWLOC_OBJ_CORE; - break; - case LAMA_LEVEL_PU: - *hwloc_key = HWLOC_OBJ_PU; - break; - case LAMA_LEVEL_CACHE_L1: - *hwloc_key = HWLOC_OBJ_CACHE; - *depth = 1; - break; - case LAMA_LEVEL_CACHE_L2: - *hwloc_key = HWLOC_OBJ_CACHE; - *depth = 2; - break; - case LAMA_LEVEL_CACHE_L3: - *hwloc_key = HWLOC_OBJ_CACHE; - *depth = 3; - break; - case LAMA_LEVEL_NUMA: - *hwloc_key = HWLOC_OBJ_NODE; - break; - default: - *hwloc_key = HWLOC_OBJ_TYPE_MAX; - break; - } - - return ORTE_SUCCESS; -} - -static int rmaps_lama_convert_hwloc_key_to_lama_key(hwloc_obj_type_t hwloc_key, int depth, rmaps_lama_level_type_t *lama_key) -{ - switch(hwloc_key) { - case HWLOC_OBJ_MACHINE: - *lama_key = LAMA_LEVEL_MACHINE; - break; - /* Node: HWLOC does not support boards */ -#if 0 - case HWLOC_OBJ_BOARD: - *lama_key = LAMA_LEVEL_BOARD; - break; -#endif - case HWLOC_OBJ_SOCKET: - *lama_key = LAMA_LEVEL_SOCKET; - break; - case HWLOC_OBJ_CORE: - *lama_key = LAMA_LEVEL_CORE; - break; - case HWLOC_OBJ_PU: - *lama_key = LAMA_LEVEL_PU; - break; - case HWLOC_OBJ_CACHE: - if( 1 == depth ) { - *lama_key = LAMA_LEVEL_CACHE_L1; - } - else if( 2 == depth ) { - *lama_key = LAMA_LEVEL_CACHE_L2; - } - else if( 3 == depth ) { - *lama_key = LAMA_LEVEL_CACHE_L3; - } - else { - *lama_key = LAMA_LEVEL_UNKNOWN; - } - break; - case HWLOC_OBJ_NODE: - *lama_key = LAMA_LEVEL_NUMA; - break; - default: - *lama_key = LAMA_LEVEL_UNKNOWN; - break; - } - - return ORTE_SUCCESS; -} - -static int rmaps_lama_hwloc_compare_topos(hwloc_topology_t *left, hwloc_topology_t *right) -{ - hwloc_obj_t left_root; - hwloc_obj_t right_root; - - /* - * Note: I hope that there is a 'better' way of doing this natively with - * HWLOC, but it is not obvious if they have the ability to compare - * topologies. So do a depth first comparison of the trees. - * You may be able to use the below: - * OPAL_EQUAL != opal_dss.compare(*last_topo, topo, OPAL_HWLOC_TOPO); - */ - - left_root = hwloc_get_obj_by_depth(*left, 0, 0); - right_root = hwloc_get_obj_by_depth(*right, 0, 0); - - return rmaps_lama_hwloc_compare_subtrees(left_root, right_root); -} - -static int rmaps_lama_hwloc_compare_subtrees(hwloc_obj_t left, hwloc_obj_t right) -{ - int i, ret; - - /* - * Check Types - */ - if( 0 != (ret = hwloc_compare_types(left->type, right->type)) ) { - return ret; - } - - /* - * Check 'arity' at this level - */ - if( left->arity > right->arity ) { - return -1; - } - else if( left->arity < right->arity ) { - return 1; - } - - /* - * Check all subtrees - */ - for(i = 0; i < (int)left->arity; ++i ) { - if( 0 != (ret = rmaps_lama_hwloc_compare_subtrees(left->children[i], - right->children[i])) ) { - return ret; - } - } - - /* - * Subtree is the same if we get here - */ - return 0; -} - -static int rmaps_lama_merge_trees(opal_tree_t *src_tree, opal_tree_t *max_tree, - opal_tree_item_t *src_parent, opal_tree_item_t *max_parent) -{ - int ret, exit_status = ORTE_SUCCESS; - rmaps_lama_level_type_t *key_src, *key_max; - opal_tree_item_t *child_item = NULL, *max_grandparent = NULL; - opal_tree_item_t *max_child_item = NULL; - int num_max, num_src; - int i; - char *key_src_str = NULL; - char *key_max_str = NULL; -#if 1 - char *str = NULL; -#endif - - /* - * Basecase - */ - if( NULL == src_parent ) { - return ORTE_SUCCESS; - } - - key_src = (rmaps_lama_level_type_t*)src_tree->get_key(src_parent); - key_max = (rmaps_lama_level_type_t*)max_tree->get_key(max_parent); - - key_src_str = lama_type_enum_to_str(*key_src); - key_max_str = lama_type_enum_to_str(*key_max); - - if( 15 <= opal_output_get_verbosity(orte_rmaps_base_framework.framework_output) ) { - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: CHECK: Merge Trees: Keys Src (%2d - %s) vs Max (%2d - %s)", - *key_src, key_src_str, *key_max, key_max_str); - } - - /* - * Make sure keys at this level match. - * - * JJH: Give up if they do not match. - * JJH: We should pick a victim and prune from the tree - * JJH: preferably from the 'native' tree. - */ - if( 0 != max_tree->comp(max_parent, src_tree->get_key(src_parent)) ) { - /* - * If the source conflicts due to cache, iterate to children to find a match. - * JJH: Double check this for different heterogenous systems - */ - if( LAMA_LEVEL_CACHE_L3 == *key_src || - LAMA_LEVEL_CACHE_L2 == *key_src || - LAMA_LEVEL_CACHE_L1 == *key_src || - LAMA_LEVEL_NUMA == *key_src ) { - opal_output_verbose(10, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Warning: Merge Trees: " - "Src with Conflicting Memory Hierarchy [Src (%2d - %s) vs Max (%2d - %s)]", - *key_src, key_src_str, *key_max, key_max_str); - - /* - * If we are pruning a cache level, then check to make sure it is - * not important to the process layout. - */ - if( !rmaps_lama_ok_to_prune_level(*key_src) ) { - orte_show_help("help-orte-rmaps-lama.txt", - "orte-rmaps-lama:merge-conflict-bad-prune-src", - true, - key_src_str, - (NULL == rmaps_lama_cmd_map ? "[Not Provided]" : rmaps_lama_cmd_map), - (NULL == rmaps_lama_cmd_bind ? "[Not Provided]" : rmaps_lama_cmd_bind), - (NULL == rmaps_lama_cmd_mppr ? "[Not Provided]" : rmaps_lama_cmd_mppr), - (NULL == rmaps_lama_cmd_ordering ? "[Not Provided]" : rmaps_lama_cmd_ordering)); - exit_status = ORTE_ERROR; - goto cleanup; - } - - /* - * If the number of children at this pruned level was larger than - * the max tree arity at this level, then duplicate the max_tree - * element the approprate number of times - */ - max_grandparent = opal_tree_get_parent(max_parent); - num_max = opal_tree_num_children(max_grandparent); - num_src = opal_tree_num_children(src_parent); - - for(i = 0; i < (num_src - num_max); ++i ) { -#if 1 - str = rmaps_lama_max_tree_pretty_print_subtree_element_get(max_tree, max_parent, 0); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Merge: Appending child %s - post prune", - str); - free(str); -#endif - /* Duplicate max child subtree */ - opal_tree_copy_subtree(max_tree, max_parent, max_tree, max_grandparent); - } - - /* - * Iterate to children, until we find a match - */ - for(child_item = opal_tree_get_first_child(src_parent); - child_item != NULL; - child_item = opal_tree_get_next_sibling(child_item) ) { - - if( ORTE_SUCCESS != (ret = rmaps_lama_merge_trees(src_tree, - max_tree, - child_item, - max_parent)) ) { - exit_status = ret; - goto cleanup; - } - } - - exit_status = ORTE_SUCCESS; - goto cleanup; - } - /* - * If the max tree conflicts due to cache, then we need to prune the - * max tree until it matches. - * JJH: If we are pruning a level of the hierarchy then make sure we - * JJH: don't need it for the process layout. - */ - else if( LAMA_LEVEL_CACHE_L3 == *key_max || - LAMA_LEVEL_CACHE_L2 == *key_max || - LAMA_LEVEL_CACHE_L1 == *key_max || - LAMA_LEVEL_NUMA == *key_max ) { - opal_output_verbose(10, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Warning: Merge Trees: " - "Max with Conflicting Memory Hierarchy [Src (%2d - %s) vs Max (%2d - %s)]", - *key_src, key_src_str, *key_max, key_max_str); - - /* - * If we are pruning a cache level, then check to make sure it is - * not important to the process layout. - */ - if( !rmaps_lama_ok_to_prune_level(*key_max) ) { - orte_show_help("help-orte-rmaps-lama.txt", - "orte-rmaps-lama:merge-conflict-bad-prune-src", - true, - key_max_str, - (NULL == rmaps_lama_cmd_map ? "[Not Provided]" : rmaps_lama_cmd_map), - (NULL == rmaps_lama_cmd_bind ? "[Not Provided]" : rmaps_lama_cmd_bind), - (NULL == rmaps_lama_cmd_mppr ? "[Not Provided]" : rmaps_lama_cmd_mppr), - (NULL == rmaps_lama_cmd_ordering ? "[Not Provided]" : rmaps_lama_cmd_ordering)); - exit_status = ORTE_ERROR; - goto cleanup; - } - - max_child_item = opal_tree_get_first_child(max_parent); - /* Prune parent */ - opal_tree_remove_item(max_tree, max_parent); - - /* Try again with child */ - exit_status = rmaps_lama_merge_trees(src_tree, - max_tree, - src_parent, - max_child_item); - goto cleanup; - } - - /* - * If we cannot resolve it, give up. - */ - opal_output(0, "mca:rmaps:lama: Error: Merge Trees: " - "Different Keys Src (%2d - %s) vs Max (%2d - %s) - Do not know how to resolve - give up!", - *key_src, key_src_str, *key_max, key_max_str); - - exit_status = ORTE_ERROR; - goto cleanup; - } - - num_max = opal_tree_num_children(max_parent); - num_src = opal_tree_num_children(src_parent); - - /* - * If the 'native' tree has more children than the 'max' tree. - * Add the missing children to the 'max' tree. - */ - if( num_max < num_src ) { - i = 0; - for(child_item = opal_tree_get_first_child(src_parent); - child_item != NULL; - child_item = opal_tree_get_next_sibling(child_item)) { - if(i >= num_max ) { -#if 1 - str = rmaps_lama_max_tree_pretty_print_subtree_element_get(src_tree, child_item, 0); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Merge: Appending child %s", - str); - free(str); -#endif - /* Add child's subtree to max */ - opal_tree_copy_subtree(src_tree, child_item, max_tree, max_parent); - } - ++i; - } - } - - /* - * Recursively search all children of 'native' tree. - * - * Note: Only need to add the children to the 'left-most' branch of the - * 'max' tree since that is the only branch that is searched during mapping. - * But do the whole thing for good measure. - */ - for( child_item = opal_tree_get_first_child(src_parent), - max_child_item = opal_tree_get_first_child(max_parent); - child_item != NULL; - child_item = opal_tree_get_next_sibling(child_item), - max_child_item = opal_tree_get_next_sibling(max_child_item) ) { - - if( ORTE_SUCCESS != (ret = rmaps_lama_merge_trees(src_tree, - max_tree, - child_item, - max_child_item)) ) { - exit_status = ret; - goto cleanup; - } - } - - cleanup: - if( NULL != key_src_str ) { - free(key_src_str); - key_src_str = NULL; - } - - if( NULL != key_max_str ) { - free(key_max_str); - key_max_str = NULL; - } - - return exit_status; -} - -static int rmaps_lama_prune_max_tree(opal_tree_t *max_tree, opal_tree_item_t *parent_item) -{ - int ret; - opal_tree_item_t *child_item = NULL, *next_item; - int i; - bool found; - rmaps_lama_level_type_t *key_max; - char *tmp_str = NULL; - - /* - * Basecase - */ - if( NULL == parent_item ) { - return ORTE_SUCCESS; - } - - /* - * Recursively decend tree - Depth first - * Basecase: No children, loop skipped - */ - child_item = opal_tree_get_first_child(parent_item); - while( child_item != NULL ) { - /* Do this before the recursive call, since it might remove this - * child so we need to preserve a pointer to the next sibling. - */ - next_item = opal_tree_get_next_sibling(child_item); - - if( ORTE_SUCCESS != (ret = rmaps_lama_prune_max_tree(max_tree, - child_item)) ) { - return ret; - } - - child_item = next_item; - } - - key_max = (rmaps_lama_level_type_t*)max_tree->get_key(parent_item); - - /* - * Check keys against the user supplied layout - */ - found = false; - for(i = 0; i < lama_mapping_num_layouts; ++i ) { - if( 0 == max_tree->comp(parent_item, &lama_mapping_layout[i]) ) { - found = true; - break; - } - } - - if( !found ) { - if( 15 <= opal_output_get_verbosity(orte_rmaps_base_framework.framework_output) ) { - tmp_str = lama_type_enum_to_str(*key_max); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ----- Before pruning %s", - tmp_str); - free(tmp_str); - rmaps_lama_max_tree_pretty_print_tree(max_tree); - } - - opal_tree_remove_item(max_tree, parent_item); - - return ORTE_SUCCESS; - } - - return ORTE_SUCCESS; -} - - -hwloc_obj_t * rmaps_lama_find_nth_subtree_match(hwloc_topology_t hwloc_topo, - hwloc_obj_t parent_obj, - int nth, - rmaps_lama_level_type_t lama_key) -{ - hwloc_obj_t *cur_child = NULL; - hwloc_obj_type_t hwloc_key; - int depth; - int num_found; -#if 0 - char str[128]; -#endif - - cur_child = (hwloc_obj_t*)malloc(sizeof(hwloc_obj_t) * 1); - - /* - * Convert LAMA key to HWLOC key - */ - rmaps_lama_convert_lama_key_to_hwloc_key(lama_key, &hwloc_key, &depth); - - /* - * Decend tree looking for the n'th matching subtree - */ - num_found = -1; - rmaps_lama_find_nth_subtree_match_core(hwloc_topo, - parent_obj, - nth, - &num_found, - hwloc_key, - depth, - cur_child); - - /* - * Check to see if we found it - */ -#if 0 - hwloc_obj_snprintf(str, sizeof(str), hwloc_topo, *cur_child, "#", 0); - if( nth == num_found ) { - printf("--> FOUND : %-20s \t -- \t %2d of %2d\n", str, nth, num_found); - } - else { - printf("--> MISSING : %-20s \t -- \t %2d of %2d\n", str, nth, num_found); - } -#endif - - if( nth == num_found ) { - return cur_child; - } - else { - free(cur_child); - return NULL; - } -} - -static int rmaps_lama_find_nth_subtree_match_core(hwloc_topology_t hwloc_topo, - hwloc_obj_t parent_obj, - int nth, - int *num_found, - hwloc_obj_type_t hwloc_key, - int depth, - hwloc_obj_t *cur_child) -{ - unsigned i; - bool found = false; - -#if 0 - { - char str[128]; - hwloc_obj_snprintf(str, sizeof(str), hwloc_topo, parent_obj, "#", 0); - printf("--> Checking -- %-20s \t -- \t %2d of %2d\n", str, nth, *num_found); - } -#endif - - /* - * Check if the keys match - */ - if( hwloc_key == parent_obj->type ) { - if( HWLOC_OBJ_CACHE == parent_obj->type && - depth == (int)parent_obj->attr->cache.depth ) { - *num_found += 1; - found = true; - } else { - *num_found += 1; - found = true; - } - } - - /* - * Basecase: - * If we have found the correct item, return - */ - if( nth == *num_found ) { - *cur_child = parent_obj; - return ORTE_SUCCESS; - } - - /* - * Do no go any deeper in the tree than we have to - */ - if( !found ) { - for(i = 0; i < parent_obj->arity; ++i ) { - rmaps_lama_find_nth_subtree_match_core(hwloc_topo, - parent_obj->children[i], - nth, - num_found, - hwloc_key, - depth, - cur_child); - if( nth == *num_found ) { - return ORTE_SUCCESS; - } - } - } - - return ORTE_SUCCESS; -} - -hwloc_obj_t * rmaps_lama_find_parent(hwloc_topology_t hwloc_topo, - hwloc_obj_t *child_obj, - rmaps_lama_level_type_t lama_key) -{ - hwloc_obj_t *cur_parent = NULL; - hwloc_obj_type_t hwloc_key; - int depth; - - /* - * Convert LAMA key to HWLOC key - */ - rmaps_lama_convert_lama_key_to_hwloc_key(lama_key, &hwloc_key, &depth); - - /* - * Sanity check - */ - if( hwloc_key == (*child_obj)->type ) { - if( HWLOC_OBJ_CACHE == (*child_obj)->type && - depth == (int)(*child_obj)->attr->cache.depth ) { - return child_obj; - } else { - return child_obj; - } - } - - cur_parent = (hwloc_obj_t*)malloc(sizeof(hwloc_obj_t) * 1); - if (NULL == cur_parent) { - return NULL; - } - - /* - * Accend tree to find mathing parent - */ - *cur_parent = (*child_obj)->parent; - while(NULL != *cur_parent ) { - if( hwloc_key == (*cur_parent)->type ) { - if( HWLOC_OBJ_CACHE == (*cur_parent)->type && - depth == (int)(*cur_parent)->attr->cache.depth ) { - return cur_parent; - } else { - return cur_parent; - } - } - - *cur_parent = (*cur_parent)->parent; - } - - free(cur_parent); - return NULL; -} - - -/********************************* - * Max Tree Structure Functions - *********************************/ -opal_tree_t * rmaps_lama_create_empty_max_tree(void) -{ - opal_tree_t *tmp_tree = NULL; - - tmp_tree = OBJ_NEW(opal_tree_t); - opal_tree_init(tmp_tree, - &lama_max_tree_comp, - &lama_max_tree_serialize, - &lama_max_tree_deserialize, - &lama_max_tree_get_key); - - return tmp_tree; -} - -static int lama_max_tree_comp(opal_tree_item_t *item, void *key) -{ - if( ((rmaps_lama_max_tree_item_t *)item)->type == *((rmaps_lama_level_type_t *)key) ) { - return 0; - } - - return -1; -} - -static int lama_max_tree_serialize(opal_tree_item_t *item, opal_buffer_t *buffer) -{ - opal_dss.pack(buffer, &(((rmaps_lama_max_tree_item_t *)item)->type), 1, OPAL_INT); - - return ORTE_SUCCESS; -} - -static int lama_max_tree_deserialize(opal_buffer_t *buffer, opal_tree_item_t **item) -{ - rmaps_lama_max_tree_item_t *element; - orte_std_cntr_t n = 1; - - element = OBJ_NEW(rmaps_lama_max_tree_item_t); - if( OPAL_SUCCESS == opal_dss.unpack(buffer, &(element->type), &n, OPAL_INT) ) { - *item = (opal_tree_item_t*)element; - } else { - *item = NULL; - } - - return ORTE_SUCCESS; -} - -static void * lama_max_tree_get_key(opal_tree_item_t *item) -{ - return &(((rmaps_lama_max_tree_item_t *)item)->type); -} - - -/********************************* - * Pretty Print Functions - *********************************/ -void rmaps_lama_max_tree_pretty_print_tree(opal_tree_t *tree) -{ - if( NULL == tree ) { - return; - } - - if( opal_tree_is_empty(tree) ) { - return; - } - - pretty_print_subtree(tree, opal_tree_get_root(tree), 0); - - return; -} - -static char * rmaps_lama_max_tree_pretty_print_subtree_element_get(opal_tree_t *tree, - opal_tree_item_t *parent, - int level) -{ - char *element_str = NULL; - char *spacer = NULL; - char *label = NULL; - rmaps_lama_level_type_t *type = NULL; - int i; - - if( NULL == parent ) { - return NULL; - } - - spacer = (char *)malloc(sizeof(char) * (level+1)); - for(i = 0; i < level; ++i ) { - spacer[i] = ' '; - } - spacer[level] = '\0'; - - type = (rmaps_lama_level_type_t *)(tree->get_key(parent)); - label = lama_type_enum_to_str(*type); - - asprintf(&element_str, "%s[%s \t : %3d, %3d", - spacer, label, - parent->opal_tree_num_children, parent->opal_tree_num_ancestors); - - free(spacer); - free(label); - - return element_str; -} - -static void pretty_print_subtree(opal_tree_t *tree, opal_tree_item_t *parent, int level) -{ - opal_tree_item_t *child = NULL; - - if( NULL == parent ) { - return; - } - - /* - * Display Self - */ - pretty_print_subtree_element(tree, parent, level); - - /* - * Depth-first display children - * Basecase; If no children - return - */ - level++; - for(child = opal_tree_get_first_child(parent); - child != NULL; - child = opal_tree_get_next_sibling(child) ) { - pretty_print_subtree(tree, child, level); - } - - return; - -} - -static void pretty_print_subtree_element(opal_tree_t *tree, opal_tree_item_t *parent, int level) -{ - char *element_str = NULL; - - if( NULL == parent ) { - return; - } - - element_str = rmaps_lama_max_tree_pretty_print_subtree_element_get(tree, parent, level); - - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Tree Element: %s", - element_str); - - free(element_str); - - return; -} diff --git a/orte/mca/rmaps/lama/rmaps_lama_module.c b/orte/mca/rmaps/lama/rmaps_lama_module.c deleted file mode 100644 index ceb97bf25b1..00000000000 --- a/orte/mca/rmaps/lama/rmaps_lama_module.c +++ /dev/null @@ -1,1914 +0,0 @@ -/* - * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. - * - * Copyright (c) 2012-2017 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2014 Intel, Inc. All rights reserved - * Copyright (c) 2015 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "orte_config.h" -#include "orte/constants.h" -#include "orte/types.h" - -#include -#ifdef HAVE_UNISTD_H -#include -#endif /* HAVE_UNISTD_H */ -#include - -#include "opal/mca/hwloc/hwloc-internal.h" - -#include "opal/util/argv.h" -#include "opal/class/opal_tree.h" - -#include "orte/util/show_help.h" -#include "orte/util/error_strings.h" - -#include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/rmaps/base/rmaps_private.h" -#include "orte/mca/rmaps/base/base.h" - -#include "orte/runtime/orte_globals.h" - -#include "rmaps_lama.h" - -#include MCA_timer_IMPLEMENTATION_HEADER - - -/********************************* - * Module setup - *********************************/ -static int orte_rmaps_lama_map(orte_job_t *jdata); -orte_rmaps_base_module_t orte_rmaps_lama_module = { - orte_rmaps_lama_map -}; - - -/********************************* - * Timer - *********************************/ -#define RMAPS_LAMA_TIMER_TOTAL 0 -#define RMAPS_LAMA_TIMER_PARSE_PARAMS 1 -#define RMAPS_LAMA_TIMER_BUILD_MAX_TREE 2 -#define RMAPS_LAMA_TIMER_MAPPING 3 -#define RMAPS_LAMA_TIMER_ORDERING 4 -#define RMAPS_LAMA_TIMER_MAX 5 - -static double rmaps_lama_get_time(void); -static void rmaps_lama_set_time(int idx, bool is_start); -static void rmaps_lama_display_all_timers(void); -static void rmaps_lama_clear_timers(void); -static void rmaps_lama_display_indv_timer_core(double diff, char *str); - -static double timer_start[RMAPS_LAMA_TIMER_MAX]; -static double timer_end[RMAPS_LAMA_TIMER_MAX]; -static double timer_accum[RMAPS_LAMA_TIMER_MAX]; - -#define RMAPS_LAMA_CLEAR_TIMERS() \ - { \ - if( rmaps_lama_timing_enabled ) { \ - rmaps_lama_clear_timers(); \ - } \ - } -#define RMAPS_LAMA_START_TIMER(idx) \ - { \ - if( rmaps_lama_timing_enabled ) { \ - rmaps_lama_set_time(idx, true); \ - } \ - } -#define RMAPS_LAMA_END_TIMER(idx) \ - { \ - if( rmaps_lama_timing_enabled ) { \ - rmaps_lama_set_time(idx, false); \ - } \ - } -#define RMAPS_LAMA_DISPLAY_TIMERS() \ - { \ - if( rmaps_lama_timing_enabled ) { \ - rmaps_lama_display_all_timers(); \ - } \ - } - - -/********************************* - * Structures & Defines - *********************************/ -static void rmaps_lama_hwloc_user_construct(rmaps_lama_hwloc_user_t *item); -static void rmaps_lama_hwloc_user_destruct(rmaps_lama_hwloc_user_t *item); - -OBJ_CLASS_INSTANCE(rmaps_lama_hwloc_user_t, - opal_object_t, - rmaps_lama_hwloc_user_construct, - rmaps_lama_hwloc_user_destruct); - - -/********************************* - * Globals - *********************************/ -/* - * Mapping - */ -rmaps_lama_level_type_t *lama_mapping_layout = NULL; -static rmaps_lama_level_type_t *lama_mapping_layout_sort = NULL; -int lama_mapping_num_layouts = 0; - -/* - * Binding - */ -rmaps_lama_level_type_t lama_binding_level = LAMA_LEVEL_UNKNOWN; -static int lama_binding_num_levels = 0; - -/* - * MPPR - */ -rmaps_lama_level_info_t *lama_mppr_levels = NULL; -int lama_mppr_num_levels = 0; - -/* - * Ordering - */ -static rmaps_lama_order_type_t lama_ordering = LAMA_ORDER_NATURAL; - -/* - * Homogeneous system optimization - */ -bool lama_mppr_max_tree_homogeneous_system = false; - - -/********************************* - * Support Macros - *********************************/ - - -/********************************* - * Support functions - *********************************/ -/* - * Preprocess the command line arguments - */ -static int orte_rmaps_lama_process_params(orte_job_t *jdata); - -/* - * Mapping Support: - * Core mapping function - */ -static int orte_rmaps_lama_map_core(orte_job_t *jdata); - -/* - * Mapping Support: - * Recursive function for mapping process - */ -static int rmaps_lama_map_core_iter_level(orte_job_t *jdata, - orte_app_context_t *cur_app_context, - opal_list_t *node_list, - orte_node_t **cur_mach_ptr, - opal_tree_t *max_tree, - int cur_level, - int mach_level, - int **pu_idx_ref, - int **last_pu_idx_ref, - int *num_mapped, - int max_procs, - int *iter_passes); - -/* - * Mapping Support: - * Access the next machine in the node list - */ -static orte_node_t* get_next_machine(orte_job_t *jdata, opal_list_t *node_list, - opal_list_item_t *cur_mach); - -/* - * Mapping Support: - * Check the availability of the requested slot on the specified node - */ -static int check_node_availability(orte_node_t *cur_node, - opal_tree_t *max_tree, - int *pu_idx_ref, - char **slot_list); - -/* - * Mapping Support: - * Debugging PU display - */ -static void display_pu_ref(int *ref, int size, int rank, orte_proc_t *proc); -static char * pu_ref_to_str(int *ref, int size); - -/* - * Mapping Support: - * Convert the process layout 'layer' to the sorted position for the PU - */ -static int convert_layer_to_sort_idx(rmaps_lama_level_type_t layer); - -/* - * MPPR Support: - * Check to make sure a process can be placed on this resource given the - * MPPR restrictions. - */ -static int rmaps_lama_check_mppr(orte_node_t *node, - hwloc_obj_t *child_obj); -static int rmaps_lama_iter_mppr_parents(orte_node_t *node, - hwloc_obj_t *child_obj, - bool check_only); -static int rmaps_lama_iter_mppr_children(orte_node_t *node, - hwloc_obj_t *child_obj, - bool check_only); - -/* - * MPPR Support: - * Increment parents of this child to account for a process being placed - * on this resource. - */ -static int rmaps_lama_inc_mppr(orte_node_t *node, - hwloc_obj_t *child_obj); - -/* - * Mapping Support: - * Return the native representation of the slot list - */ -static char * get_native_slot_list(orte_node_t *cur_node, - hwloc_obj_t *pu_obj, - int *put_idx_ref); - -/* - * Ordering Support: - * Reorder sequentially - */ -static int rmaps_lama_ordering_sequential(orte_job_t *jdata); - -/* - * Map a single process to a specific node - */ -static int orte_rmaps_lama_map_process(orte_job_t *jdata, - orte_node_t *node, - int app_idx, - orte_proc_t **proc); - -/********************************* - * Main Module function to map a job - *********************************/ -static int orte_rmaps_lama_map(orte_job_t *jdata) -{ - int ret, exit_status = ORTE_SUCCESS; - mca_base_component_t *loc_comp = &mca_rmaps_lama_component.base_version; - - RMAPS_LAMA_CLEAR_TIMERS(); - RMAPS_LAMA_START_TIMER(RMAPS_LAMA_TIMER_TOTAL); - - /* - * Sanity Check: - * If we are not the 'chosen' mapper, then exit here - */ - if (NULL != jdata->map->req_mapper && - 0 != strcasecmp(jdata->map->req_mapper, loc_comp->mca_component_name)) { - /* a mapper has been specified, and it isn't me */ - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: job %s not using lama mapper (using %s)", - ORTE_JOBID_PRINT(jdata->jobid), - jdata->map->req_mapper); - return ORTE_ERR_TAKE_NEXT_OPTION; - } - - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Mapping job %s", - ORTE_JOBID_PRINT(jdata->jobid)); - - /* - * Identify this as the mapper responsible for this job - */ - if (NULL != jdata->map->last_mapper) { - free(jdata->map->last_mapper); - } - jdata->map->last_mapper = strdup(loc_comp->mca_component_name); - - /* - * Start at the beginning... - */ - jdata->num_procs = 0; - - /* - * Process the command line arguments - */ - RMAPS_LAMA_START_TIMER(RMAPS_LAMA_TIMER_PARSE_PARAMS); - if( ORTE_SUCCESS != (ret = orte_rmaps_lama_process_params(jdata)) ) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - RMAPS_LAMA_END_TIMER(RMAPS_LAMA_TIMER_PARSE_PARAMS); - - /* - * Actually map the job - */ - if( ORTE_SUCCESS != (ret = orte_rmaps_lama_map_core(jdata)) ) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - - /* - * All Done - */ - - RMAPS_LAMA_END_TIMER(RMAPS_LAMA_TIMER_TOTAL); - RMAPS_LAMA_DISPLAY_TIMERS(); - - - cleanup: - if( NULL != lama_mapping_layout ) { - free(lama_mapping_layout); - lama_mapping_layout = NULL; - } - - if( NULL != lama_mapping_layout_sort ) { - free(lama_mapping_layout_sort); - lama_mapping_layout_sort = NULL; - } - - if( NULL != lama_mppr_levels ) { - free(lama_mppr_levels); - lama_mppr_levels = NULL; - } - - return exit_status; -} - - -/********************************* - * User defined lookup structure for hwloc topology - *********************************/ -static void rmaps_lama_hwloc_user_construct(rmaps_lama_hwloc_user_t *item) -{ - item->node_mppr = OBJ_NEW(opal_pointer_array_t); - opal_pointer_array_init(item->node_mppr, - ORTE_GLOBAL_ARRAY_BLOCK_SIZE, - ORTE_GLOBAL_ARRAY_MAX_SIZE, - ORTE_GLOBAL_ARRAY_BLOCK_SIZE); -} - -static void rmaps_lama_hwloc_user_destruct(rmaps_lama_hwloc_user_t *item) -{ - orte_std_cntr_t i; - - if( NULL != item->node_mppr ) { - for(i = 0; i < item->node_mppr->size; ++i) { - if( NULL != item->node_mppr->addr[i] ) { - OBJ_RELEASE(item->node_mppr->addr[i]); - item->node_mppr->addr[i] = NULL; - } - } - OBJ_RELEASE(item->node_mppr); - item->node_mppr = NULL; - } -} - - -/********************************* - * Command line parameter parsing functions - *********************************/ -static int orte_rmaps_lama_process_params(orte_job_t *jdata) -{ - int ret, i; - char *type_str = NULL; - - /* - * Process map/bind/order/mppr aliases. It will print its own - * error message if something went wrong. - */ - if( ORTE_SUCCESS != (ret = rmaps_lama_process_alias_params(jdata) ) ) { - ORTE_ERROR_LOG(ret); - return ret; - } - - /* - * Parse: Binding. It will print its own error message if - * something goes wrong. - */ - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ---------------------------------"); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ----- Binding : [%s]", - rmaps_lama_cmd_bind); - if( ORTE_SUCCESS != (ret = rmaps_lama_parse_binding(rmaps_lama_cmd_bind, - &lama_binding_level, - &lama_binding_num_levels)) ) { - ORTE_ERROR_LOG(ret); - return ret; - } - - if( 10 <= opal_output_get_verbosity(orte_rmaps_base_framework.framework_output) ) { - type_str = lama_type_enum_to_str(lama_binding_level); - opal_output_verbose(10, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ----- Binding : %*d x %10s", - MAX_BIND_DIGIT_LEN, lama_binding_num_levels, type_str); - free(type_str); - type_str = NULL; - } - /* Reset the binding option since we are going to do it ourselves */ - OPAL_SET_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_NONE); - - /* - * Parse: Mapping from Process Layout string. It will print its - * own error message if something goes wrong. - */ - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ---------------------------------"); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ----- Mapping : [%s]", - rmaps_lama_cmd_map); - if( ORTE_SUCCESS != (ret = rmaps_lama_parse_mapping(rmaps_lama_cmd_map, - &lama_mapping_layout, - &lama_mapping_layout_sort, - &lama_mapping_num_layouts)) ) { - ORTE_ERROR_LOG(ret); - return ret; - } - - if( 10 <= opal_output_get_verbosity(orte_rmaps_base_framework.framework_output) ) { - for( i = 0; i < lama_mapping_num_layouts; ++i ) { - type_str = lama_type_enum_to_str(lama_mapping_layout[i]); - opal_output_verbose(10, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ----- Mapping : (%d) %10s (%d vs %d)", - i, type_str, - lama_mapping_layout[i], lama_mapping_layout_sort[i]); - free(type_str); - type_str = NULL; - } - } - - /* - * Parse: MPPR. It will print its own error message if something - * goes wrong. - */ - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ---------------------------------"); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ----- MPPR : [%s]", - rmaps_lama_cmd_mppr); - if( ORTE_SUCCESS != (ret = rmaps_lama_parse_mppr(rmaps_lama_cmd_mppr, - &lama_mppr_levels, - &lama_mppr_num_levels)) ) { - ORTE_ERROR_LOG(ret); - return ret; - } - - if( 10 <= opal_output_get_verbosity(orte_rmaps_base_framework.framework_output) ) { - for( i = 0; i < lama_mppr_num_levels; ++i ) { - type_str = lama_type_enum_to_str(lama_mppr_levels[i].type); - opal_output_verbose(10, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ----- MPPR : %*d at %10s", - MAX_BIND_DIGIT_LEN, lama_mppr_levels[i].max_resources, type_str); - free(type_str); - type_str = NULL; - } - } - - /* - * Parse: Ordering - */ - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ---------------------------------"); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ----- Ordering : [%s]", - rmaps_lama_cmd_ordering); - if( ORTE_SUCCESS != (ret = rmaps_lama_parse_ordering(rmaps_lama_cmd_ordering, - &lama_ordering)) ) { - ORTE_ERROR_LOG(ret); - return ret; - } - - if( 10 <= opal_output_get_verbosity(orte_rmaps_base_framework.framework_output) ) { - if( LAMA_ORDER_NATURAL == lama_ordering ) { - type_str = strdup("Natural"); - } - else if( LAMA_ORDER_SEQ == lama_ordering ) { - type_str = strdup("Sequential"); - } - else { - type_str = strdup("Unknown"); - } - opal_output_verbose(10, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ----- Ordering : %10s", - type_str); - free(type_str); - type_str = NULL; - } - - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ---------------------------------"); - - return ORTE_SUCCESS; -} - - -/********************************* - * Support functions - *********************************/ -rmaps_lama_level_type_t lama_type_str_to_enum(char *param) -{ - if( 0 == strncmp(param, "n", strlen("n")) ) { - return LAMA_LEVEL_MACHINE; - } - else if( 0 == strncmp(param, "b", strlen("b")) ) { - return LAMA_LEVEL_BOARD; - } - else if( 0 == strncmp(param, "s", strlen("s")) ) { - return LAMA_LEVEL_SOCKET; - } - else if( 0 == strncmp(param, "c", strlen("c")) ) { - return LAMA_LEVEL_CORE; - } - else if( 0 == strncmp(param, "h", strlen("h")) ) { - return LAMA_LEVEL_PU; - } - else if( 0 == strncmp(param, "L1", strlen("L1")) ) { - return LAMA_LEVEL_CACHE_L1; - } - else if( 0 == strncmp(param, "L2", strlen("L2")) ) { - return LAMA_LEVEL_CACHE_L2; - } - else if( 0 == strncmp(param, "L3", strlen("L3")) ) { - return LAMA_LEVEL_CACHE_L3; - } - else if( 0 == strncmp(param, "N", strlen("N")) ) { - return LAMA_LEVEL_NUMA; - } - - return LAMA_LEVEL_UNKNOWN; -} - -char * lama_type_enum_to_str(rmaps_lama_level_type_t param) -{ - if( LAMA_LEVEL_MACHINE == param ) { - return strdup("Machine"); - } - else if( LAMA_LEVEL_BOARD == param ) { - return strdup("Board"); - } - else if( LAMA_LEVEL_SOCKET == param ) { - return strdup("Socket"); - } - else if( LAMA_LEVEL_CORE == param ) { - return strdup("Core"); - } - else if( LAMA_LEVEL_PU == param ) { - return strdup("Hw. Thread"); - } - else if( LAMA_LEVEL_CACHE_L1 == param ) { - return strdup("L1 Cache"); - } - else if( LAMA_LEVEL_CACHE_L2 == param ) { - return strdup("L2 Cache"); - } - else if( LAMA_LEVEL_CACHE_L3 == param ) { - return strdup("L3 Cache"); - } - else if( LAMA_LEVEL_NUMA == param ) { - return strdup("NUMA"); - } - - return strdup("Unknown"); -} - -/********************************* - * Core Mapper function - *********************************/ -static int orte_rmaps_lama_map_core(orte_job_t *jdata) -{ - int ret, exit_status = ORTE_SUCCESS; - int cur_app_idx = 0; - int num_slots; - orte_app_context_t *cur_app_context = NULL; - orte_node_t *cur_mach = NULL; - orte_node_t **cur_mach_ptr = NULL; - orte_proc_t *proc = NULL; - opal_list_t *node_list = NULL; - opal_list_item_t *item = NULL; - opal_tree_t *max_tree = NULL; - int *pu_idx_ref = NULL; - int *last_pu_idx_ref = NULL; - int i, num_mapped, last_num_mapped, mach_level = -1; - orte_std_cntr_t j; - int max_procs_to_map; - int iter_passes; - char * last_level_str = NULL; - bool initial_map = true; - - /* - * Setup PU reference - * Find the position of the 'machine' - */ - pu_idx_ref = (int*)malloc(sizeof(int) * lama_mapping_num_layouts); - if (NULL == pu_idx_ref) { - return ORTE_ERROR; - } - last_pu_idx_ref = (int*)malloc(sizeof(int) * lama_mapping_num_layouts); - if (NULL == last_pu_idx_ref) { - free(pu_idx_ref); - return ORTE_ERROR; - } - - for( i = 0; i < lama_mapping_num_layouts; ++i ) { - pu_idx_ref[i] = 0; - last_pu_idx_ref[i] = -1; - if( LAMA_LEVEL_MACHINE == lama_mapping_layout[i] ) { - mach_level = i; - } - } - - /* - * Foreach app context - */ - for(cur_app_idx = 0; cur_app_idx < jdata->apps->size; ++cur_app_idx ) { - if( NULL == (cur_app_context = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, cur_app_idx))) { - continue; - } - - /* - * Get the list of nodes for this app_context. - */ - node_list = OBJ_NEW(opal_list_t); - ret = orte_rmaps_base_get_target_nodes(node_list, - &num_slots, - cur_app_context, - jdata->map->mapping, - initial_map, false); - if(ORTE_SUCCESS != ret ) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - /* Flag that all subsequent requests should not reset the node->mapped flag */ - initial_map = false; - - /* - * If a bookmark exists from some prior mapping, then start from there - */ - cur_mach = (orte_node_t*)orte_rmaps_base_get_starting_point(node_list, jdata); - - /* - * If the application did not specify the number of procs - * then set it to the number of 'slots' - * JJH: TODO: Revisit 'max_procs' calculation - */ - if (0 == cur_app_context->num_procs) { - cur_app_context->num_procs = num_slots; - } - max_procs_to_map = cur_app_context->num_procs; - - /* - * Build the Max Tree - */ - RMAPS_LAMA_START_TIMER(RMAPS_LAMA_TIMER_BUILD_MAX_TREE); - max_tree = rmaps_lama_create_empty_max_tree(); - if( ORTE_SUCCESS != (ret = rmaps_lama_build_max_tree(jdata, node_list, - max_tree, - &lama_mppr_max_tree_homogeneous_system)) ) { - exit_status = ret; - goto cleanup; - } - RMAPS_LAMA_END_TIMER(RMAPS_LAMA_TIMER_BUILD_MAX_TREE); - - - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Mapping: -----------------------"); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ---------------------------------"); - RMAPS_LAMA_START_TIMER(RMAPS_LAMA_TIMER_MAPPING); - - /* - * Clear PU reference - */ - for( i = 0; i < lama_mapping_num_layouts; ++i ) { - pu_idx_ref[i] = 0; - } - - /* - * Mapping: Recursively loop over all levels - */ - num_mapped = 0; - last_num_mapped = 0; - iter_passes = 0; - cur_mach_ptr = (orte_node_t**)malloc(sizeof(orte_node_t*)); - *cur_mach_ptr = cur_mach; - while( max_procs_to_map > num_mapped ) { - ret = rmaps_lama_map_core_iter_level(jdata, - cur_app_context, - node_list, - cur_mach_ptr, - max_tree, - lama_mapping_num_layouts-1, - mach_level, - &pu_idx_ref, - &last_pu_idx_ref, - &num_mapped, - max_procs_to_map, - &iter_passes); - if( ORTE_SUCCESS != ret ) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - - /* - * We only get here (without finishing the mapping) if we are going to - * start oversubscribing resources. - */ - if( max_procs_to_map > num_mapped ) { - if( !rmaps_lama_can_oversubscribe ) { - orte_show_help("help-orte-rmaps-lama.txt", - "orte-rmaps-lama:oversubscribe", - true, - num_mapped, max_procs_to_map); - exit_status = ORTE_ERROR; - goto cleanup; - } else { - rmaps_lama_am_oversubscribing = true; - } - } - - /* - * Check to see if we have made any progress in the mapping loop - */ - if( 0 < cur_app_idx && 2 == iter_passes ) { - /* - * Give it another pass: - * This is an edge case when we are trying to restart from a - * bookmark left by a previous app context. If this app context - * is starting from exactly the beginning of the allocation - * then the recursive loop could return out here after the - * increment pass. This is indicated by (iter_passes = 2). - * Since no processes were mapped, we just try again. - */ - } - else if( last_num_mapped == num_mapped ) { - orte_show_help("help-orte-rmaps-lama.txt", - "orte-rmaps-lama:no-resources-available", - true, - cur_app_idx, - num_mapped, max_procs_to_map, - (NULL == rmaps_lama_cmd_map ? "[Not Provided]" : rmaps_lama_cmd_map), - (NULL == rmaps_lama_cmd_bind ? "[Not Provided]" : rmaps_lama_cmd_bind), - (NULL == rmaps_lama_cmd_mppr ? "[Not Provided]" : rmaps_lama_cmd_mppr), - (NULL == rmaps_lama_cmd_ordering ? "[Not Provided]" : rmaps_lama_cmd_ordering)); - exit_status = ORTE_ERROR; - goto cleanup; - } else { - last_num_mapped = num_mapped; - } - } - - /* - * Display Bookmark for debugging - */ - last_level_str = pu_ref_to_str(last_pu_idx_ref, lama_mapping_num_layouts); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Bookmark: --> Node %10s PU %10s", - jdata->bookmark->name, last_level_str); - free(last_level_str); - last_level_str = NULL; - - /* - * Clenup for next iteration - */ - if( NULL != node_list ) { - while(NULL != (item = opal_list_remove_first(node_list))) { - OBJ_RELEASE(item); - } - OBJ_RELEASE(node_list); - node_list = NULL; - } - - OBJ_RELEASE(max_tree); - max_tree = NULL; - } - - RMAPS_LAMA_END_TIMER(RMAPS_LAMA_TIMER_MAPPING); - - - /* - * Ordering - */ - RMAPS_LAMA_START_TIMER(RMAPS_LAMA_TIMER_ORDERING); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ---------------------------------"); - if( LAMA_ORDER_SEQ == lama_ordering ) { - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Ordering: Sequential ------------"); - - if( ORTE_SUCCESS != (ret = rmaps_lama_ordering_sequential(jdata)) ) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - } - else { - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Ordering: Natural ---------------"); -#if 0 - /* - * We compute our own vpids inline with the algorithm. So no need to use the - * orte_rmaps_base_compute_vpids() function. - */ -#endif - } - RMAPS_LAMA_END_TIMER(RMAPS_LAMA_TIMER_ORDERING); - - - /* - * Display Mapping - */ - if( 10 <= opal_output_get_verbosity(orte_rmaps_base_framework.framework_output) ) { - char *cpu_bitmap; - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ---------------------------------"); - for( j = 0; j < jdata->procs->size; ++j) { - if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) { - continue; - } - cpu_bitmap = NULL; - orte_get_attribute(&proc->attributes, ORTE_PROC_CPU_BITMAP, (void**)&cpu_bitmap, OPAL_STRING); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Ordering: Proc. %2d on Node %10s - Slot %s", - proc->name.vpid, proc->node->name, cpu_bitmap); - if (NULL != cpu_bitmap) { - free(cpu_bitmap); - } - } - } - - - /* - * All done - */ - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Finished ------------------------"); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ---------------------------------"); - - - cleanup: - if( NULL != node_list ) { - while(NULL != (item = opal_list_remove_first(node_list))) { - OBJ_RELEASE(item); - } - OBJ_RELEASE(node_list); - } - - if( NULL != max_tree ) { - OBJ_RELEASE(max_tree); - } - - free(pu_idx_ref); - free(last_pu_idx_ref); - - if( NULL != last_level_str ) { - free(last_level_str); - } - - return exit_status; -} - -static int rmaps_lama_map_core_iter_level(orte_job_t *jdata, - orte_app_context_t *cur_app_context, - opal_list_t *node_list, - orte_node_t **cur_mach_ptr, - opal_tree_t *max_tree, - int cur_level, - int mach_level, - int **pu_idx_ref, - int **last_pu_idx_ref, - int *num_mapped, - int max_procs, - int *iter_passes) -{ - int ret, exit_status = ORTE_SUCCESS; - int i, j; - opal_tree_item_t *tree_for_level = NULL; - int max_subtree_arity = 0; - char * level_str = NULL; - char * last_level_str = NULL; - char * slot_list = NULL; - orte_proc_t *proc = NULL; - int pu_idx = 0; - - /* - * Find the current tree for this level - * If it is the machine level, then we need to access the information from - * the node list, not the max_tree. - */ - if( cur_level != mach_level ) { - tree_for_level = opal_tree_find_with(opal_tree_get_root(max_tree), - &lama_mapping_layout[cur_level]); - /* - * We do not need subtree, but the arity of the subtree - * JJH TODO: This should be an opal_tree function. - */ - max_subtree_arity = 1; /* include self */ - while( NULL != (tree_for_level = opal_tree_get_next_sibling(tree_for_level)) ) { - ++max_subtree_arity; - } - } - else if( NULL == *cur_mach_ptr ) { - *cur_mach_ptr = get_next_machine(jdata, node_list, (opal_list_item_t*)(*cur_mach_ptr)); - } - - pu_idx = convert_layer_to_sort_idx(lama_mapping_layout[cur_level]); - level_str = lama_type_enum_to_str(lama_mapping_layout[cur_level]); - - /* - * Do we need to advance to a bookmark - */ - if( (*last_pu_idx_ref)[0] >= 0 && 0 == *iter_passes ) { - /* - * Display last mapped - */ - last_level_str = pu_ref_to_str(*last_pu_idx_ref, lama_mapping_num_layouts); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Bookmark: --> Last Mapped: Node %10s (bkmrk %10s) PU %10s - Level %2d", - (NULL == *cur_mach_ptr ? "(NULL)" : (*cur_mach_ptr)->name), - jdata->bookmark->name, last_level_str, (*last_pu_idx_ref)[pu_idx]); - free(last_level_str); - last_level_str = NULL; - - /* - * Set the level starting point to the last known index - */ - i = (*last_pu_idx_ref)[pu_idx]; - } else { - i = 0; - } - - - /* - * Loop over all siblings at this level - * Initial condition above, Increment at bottom, Break check at bottom - */ - while( 1 ) { - /* - * Define the PU index - */ - (*pu_idx_ref)[pu_idx] = i; - - if( (*last_pu_idx_ref)[0] >= 0 && 0 == *iter_passes ) { - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Mapping: --> Level %2d: %10s (%2d) - I %2d - Arity %2d - %10s - Increment only", - cur_level+1, - level_str, pu_idx, i, max_subtree_arity, - (NULL == *cur_mach_ptr ? "" : (*cur_mach_ptr)->name)); - } else { - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Mapping: --> Level %2d: %10s (%2d) - I %2d - Arity %2d - %10s", - cur_level+1, - level_str, pu_idx, i, max_subtree_arity, - (NULL == *cur_mach_ptr ? "" : (*cur_mach_ptr)->name)); - } - - - /* - * If not the inner most loop, iterate to the next level down - */ - if( cur_level > 0 ) { - ret = rmaps_lama_map_core_iter_level(jdata, - cur_app_context, - node_list, - cur_mach_ptr, - max_tree, - cur_level - 1, - mach_level, - pu_idx_ref, - last_pu_idx_ref, - num_mapped, - max_procs, - iter_passes); - if( ORTE_SUCCESS != ret ) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - } - /* - * If we are restarting the iteration from a previous bookmark then - * the first pass through is a no-op mapping pass that just increments - * the PU reference. - * Called by innermost loop - */ - else if( (*last_pu_idx_ref)[0] >= 0 && 0 == *iter_passes ) { - *iter_passes += 1; - } - /* - * Try to map at this location - */ - else { - /* - * On first pass, make sure we increment this, just so we do not - * accidentally think this is an increment pass. - */ - if( 0 == *iter_passes ) { - *iter_passes += 1; - } - - /* - * Display the PU ref for debugging - */ - display_pu_ref(*pu_idx_ref, lama_mapping_num_layouts, *num_mapped, proc); - - - /* - * Check to see if this resource is available on this node. - * - * In a heterogeneous or otherwise non-uniformly restricted - * environment we may iterate to a resource that is not - * available either because it does not exist, or is not - * available for allocation (off-lined, sub-node allocation). - * Additionally, we need to check resource constrains expressed - * in the MPPR and binding. - */ - ret = check_node_availability((*cur_mach_ptr), - max_tree, - *pu_idx_ref, - &slot_list); - if( ORTE_SUCCESS != ret || NULL == slot_list ) { - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:hwtopo: Mapping: --> Level %2d: %s - INVALID/SKIP", - cur_level+1, - level_str); - /* - * By not mapping here we just let the iterations continue - * until a suitable match is found or we have exhausted all - * possible locations to match and thus cannot map any more. - */ - } - else { - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Mapping: --> Level %2d: %s - Slot List (%s)", - cur_level+1, - level_str, slot_list); - - /* - * Map this process onto the resource specified - * level_tree_objs[*] and cur_mach point to the specific resource - */ - proc = NULL; - ret = orte_rmaps_lama_map_process(jdata, - (*cur_mach_ptr), - cur_app_context->idx, - &proc); - if( ORTE_SUCCESS != ret ) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto bailout; - } - - /* - * Set the binding for this process - */ - orte_set_attribute(&proc->attributes, ORTE_PROC_CPU_BITMAP, ORTE_ATTR_GLOBAL, slot_list, OPAL_STRING); - /* - * Insert the proc into the 'native' ordering location. - */ - proc->name.vpid = jdata->num_procs; - if (ORTE_SUCCESS != (ret = opal_pointer_array_set_item(jdata->procs, - proc->name.vpid, proc))) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } - jdata->num_procs += 1; - - /* - * Save a bookmark so we can return here later if necessary - */ - for( j = 0; j < lama_mapping_num_layouts; ++j ) { - (*last_pu_idx_ref)[j] = (*pu_idx_ref)[j]; - } - jdata->bookmark = (orte_node_t*)(*cur_mach_ptr); - - (*num_mapped)++; - } - } - - /* - * Increment loop - * - * If we are binding, then we may need to advance the binding layer - * by more than one. - */ - if( cur_level != mach_level ) { - if( lama_binding_level == lama_mapping_layout[cur_level] ) { - i += lama_binding_num_levels; - } else { - ++i; - } - } else { - /* - * Note: Currently we do not allow for 'binding' to multiple machines - * But keep the code just in case we want to play with 'stride' later - */ - if( lama_binding_level == lama_mapping_layout[cur_level] && lama_binding_num_levels > 1) { - opal_output(0, "mca:rmaps:lama: ERROR: Cannot bind to multiple machines - SHOULD NEVER HAPPEN: %s", - rmaps_lama_cmd_bind); - exit_status = ORTE_ERROR; - goto bailout; -#if 0 - for( j = 0; j < lama_binding_num_levels; ++j ) { - cur_mach = get_next_machine(jdata, node_list, (opal_list_item_t*)cur_mach); - if( NULL == cur_mach ) { - break; - } - ++i; - } -#endif - } else { - *cur_mach_ptr = get_next_machine(jdata, node_list, (opal_list_item_t*)(*cur_mach_ptr)); - ++i; - } - } - - /* - * Check if we are done mapping before iterating again - */ - if( max_procs <= *num_mapped ) { - exit_status = ORTE_SUCCESS; - goto cleanup; - } - - /* - * Check if we are done looping - */ - if( cur_level != mach_level ) { - if( i >= max_subtree_arity ) { - break; - } - } else { - if( NULL == *cur_mach_ptr ) { - break; - } - } - } - - - /* - * Sanity Check: Check if we are done mapping - */ - if( max_procs <= *num_mapped ) { - exit_status = ORTE_SUCCESS; - goto cleanup; - } - - cleanup: - /* - * If the outermost layer, the increment the number of iteration passes. - */ - if( cur_level == lama_mapping_num_layouts-1 ) { - *iter_passes += 1; - } - - bailout: - if( NULL != level_str ) { - free(level_str); - level_str = NULL; - } - - if( NULL != slot_list ) { - free(slot_list); - slot_list = NULL; - } - - return exit_status; -} - -static orte_node_t* get_next_machine(orte_job_t *jdata, opal_list_t *node_list, - opal_list_item_t *cur_mach) -{ - orte_node_t *next_mach = NULL; - - if( NULL == cur_mach ) { - next_mach = (orte_node_t*)opal_list_get_first(node_list); - } - else if( opal_list_get_last(node_list) == cur_mach ) { - next_mach = NULL; - } - else { - next_mach = (orte_node_t*)opal_list_get_next(cur_mach); - } - - return next_mach; -} - -static int orte_rmaps_lama_map_process(orte_job_t *jdata, - orte_node_t *node, - int app_idx, - orte_proc_t **proc) -{ - int ret; - - /* - * Add this node to the map, but only once - */ - if( !ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED) ) { - if (ORTE_SUCCESS > (ret = opal_pointer_array_add(jdata->map->nodes, (void*)node))) { - ORTE_ERROR_LOG(ret); - return ret; - } - ORTE_FLAG_SET(node, ORTE_NODE_FLAG_MAPPED); - OBJ_RETAIN(node); /* maintain accounting on object */ - ++(jdata->map->num_nodes); - } - - /* - * Setup the process object - */ - if (NULL == (*proc = orte_rmaps_base_setup_proc(jdata, node, app_idx))) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - return ORTE_ERR_OUT_OF_RESOURCE; - } - - return ORTE_SUCCESS; -} - -static int rmaps_lama_ordering_sequential(orte_job_t *jdata) -{ - orte_job_map_t *map; - orte_proc_t *proc = NULL, *swap = NULL; - orte_std_cntr_t i, j; - int cur_rank = 0; - orte_node_t *cur_node = NULL; - - map = jdata->map; - - opal_output_verbose(15, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ---------------------------------"); - - /* - * Assign the ranks sequentially - */ - for( i = 0; i < map->nodes->size; ++i) { - if (NULL == (cur_node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) { - continue; - } - for( j = 0; j < cur_node->procs->size; ++j) { - if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(cur_node->procs, j))) { - continue; - } - /* ignore procs from other jobs */ - if (proc->name.jobid != jdata->jobid) { - continue; - } - - opal_output_verbose(15, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Ordering: Rename Proc. %2d to %2d (Rev. %s)", - proc->name.vpid, cur_rank, proc->node->name); - proc->name.vpid = cur_rank; - ++cur_rank; - } - } - - /* - * Fix the job structure ordering - Sort by new vpid - * - * If we do not do this then the remote daemons assign the incorrect - * ranks to the processes since they use the relative ordering in the - * jdata->procs structure to determine vpids locally. - * - * JJH: Look at combining these loops with the loop in the core so we - * JJH: do not have to iterate over the list two times - */ - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ---------------------------------"); - cur_rank = 0; - for( j = 0; j < jdata->procs->size; ++j) { - if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) { - continue; - } - - opal_output_verbose(15, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Ordering: Proc. %2d on Node %s", - proc->name.vpid, proc->node->name); - - while((int)proc->name.vpid != cur_rank ) { - swap = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->name.vpid); - - opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc); - opal_pointer_array_set_item(jdata->procs, cur_rank, swap); - - opal_output_verbose(15, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Ordering: \t SWAP Proc. %2d (%d) and Proc. %2d (%d)", - proc->name.vpid, cur_rank, swap->name.vpid, proc->name.vpid); - proc = swap; - } - ++cur_rank; - } - - return ORTE_SUCCESS; -} - -static int convert_layer_to_sort_idx(rmaps_lama_level_type_t layer) -{ - int i; - - for(i = 0; i < lama_mapping_num_layouts; ++i ) { - if( lama_mapping_layout_sort[i] == layer ) { - return i; - } - } - - return 0; -} - -static void display_pu_ref(int *ref, int size, int rank, orte_proc_t *proc) -{ - char *str = NULL; - - str = pu_ref_to_str(ref, size); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Mapping: PU Ref: %s [Rank %2d] Name: %s", - str, rank, - (NULL == proc ? "(null)" : ORTE_NAME_PRINT(&proc->name))); - - free(str); - - return; -} - -static char * pu_ref_to_str(int *ref, int size) -{ - int i, idx; - char *str = NULL; - - str = (char *)malloc(sizeof(char) * (2 * size)); - for(i = 0, idx = 0; i < size; ++i, idx += 2) { - sprintf(&(str[idx]), "%2d", ref[i]); - } - - return str; -} - -static int check_node_availability(orte_node_t *cur_node, - opal_tree_t *max_tree, - int *pu_idx_ref, - char **slot_list) -{ - int exit_status = ORTE_SUCCESS; - int i; - char * level_str = NULL; - hwloc_obj_t *topo_child = NULL, *topo_parent, *topo_allocated; - - - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Checking: Node (%s) -------------", - cur_node->name); - opal_output_verbose(11, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: ---------------------------------"); - - - /* - * Determine if the current node has the necessary hardware - * as described by the PU index. - * Find the hwloc object reference for the resource pointed to - * by the PU index. - * JJH TODO: If homogeneous system then this could be simplified. - */ - topo_allocated = topo_parent = (hwloc_obj_t*)malloc(sizeof(hwloc_obj_t) * 1); - if (NULL == topo_parent) { - return ORTE_ERROR; - } - *topo_parent = hwloc_get_obj_by_depth(cur_node->topology, 0, 0); - for( i = 0; i < lama_mapping_num_layouts; ++i ) { - /* - * Skip 'machine' level - */ - if( LAMA_LEVEL_MACHINE == lama_mapping_layout_sort[i] ) { - continue; - } - /* - * Skip 'board' level - * JJH: HWLOC does not support BOARD at the moment - */ - if( LAMA_LEVEL_BOARD == lama_mapping_layout_sort[i] ) { - continue; - } - - level_str = lama_type_enum_to_str(lama_mapping_layout_sort[i]); - opal_output_verbose(11, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Checking: %2d of %s", - pu_idx_ref[i], level_str); - - /* - * Find the nth subtree matching the current key - */ - topo_child = rmaps_lama_find_nth_subtree_match(cur_node->topology, - *topo_parent, - pu_idx_ref[i], - lama_mapping_layout_sort[i]); - - /* - * If it does not exist, then this node is not capable of matching - * so it is unavailable. - */ - if( NULL == topo_child ) { - opal_output_verbose(11, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Check failed: Node %s does not have a %10s %2d", - cur_node->name, level_str, pu_idx_ref[i]); - exit_status = ORTE_ERROR; - goto cleanup; - } - - /* - * Keep decending the tree - */ - topo_parent = topo_child; - free(level_str); - level_str = NULL; - } - - /* - * We have sufficient hardware :) - */ - - - /* - * Return the native slot list to bind to - * Internally checks the MPPR - */ - *slot_list = get_native_slot_list(cur_node, topo_parent, pu_idx_ref); - if( NULL == *slot_list ) { - goto cleanup; - } - - cleanup: - if( NULL != level_str ) { - free(level_str); - level_str = NULL; - } - - if( ORTE_SUCCESS != exit_status ) { - if( NULL != *slot_list ) { - free(*slot_list); - *slot_list = NULL; - } - } - - free(topo_allocated); - - return exit_status; -} - -static int rmaps_lama_check_mppr(orte_node_t *node, - hwloc_obj_t *child_obj) -{ - int ret; - - /* - * Optimization if no MPPR provided - */ - if( NULL == lama_mppr_levels ) { - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: No MPPR to check - Skip..."); - return ORTE_SUCCESS; - } - - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Check ---------------------------"); - /* - * Check Parents (excluding self) - */ - if( ORTE_SUCCESS != (ret = rmaps_lama_iter_mppr_parents(node, &(*child_obj)->parent, true)) ) { - return ret; - } - - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Check ---------------------------"); - - /* - * Check Children (including self) - */ - if( ORTE_SUCCESS != (ret = rmaps_lama_iter_mppr_children(node, child_obj, true)) ) { - return ret; - } - - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Check ---------------------------"); - - return ORTE_SUCCESS; -} - -static int rmaps_lama_inc_mppr(orte_node_t *node, - hwloc_obj_t *child_obj) -{ - int ret; - - /* - * Optimization if no MPPR provided - */ - if( NULL == lama_mppr_levels ) { - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: No MPPR to increment - Skip..."); - return ORTE_SUCCESS; - } - - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Inc ---------------------------"); - /* - * Increment Parents (excluding self) - */ - if( ORTE_SUCCESS != (ret = rmaps_lama_iter_mppr_parents(node, &(*child_obj)->parent, false)) ) { - return ret; - } - - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Inc ---------------------------"); - - /* - * Increment Children (including self) - */ - if( ORTE_SUCCESS != (ret = rmaps_lama_iter_mppr_children(node, child_obj, false)) ) { - return ret; - } - - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Inc ---------------------------"); - - return ORTE_SUCCESS; -} - -static int rmaps_lama_iter_mppr_parents(orte_node_t *node, - hwloc_obj_t *child_obj, - bool check_only) -{ - rmaps_lama_hwloc_user_t *hwloc_userdata = NULL; - rmaps_lama_node_mppr_t *mppr_accounting = NULL; - char str[128]; - - /* - * Basecase - */ - if( NULL == *child_obj ) { - return ORTE_SUCCESS; - } - - /* - * Check self - */ - /* - * Access MPPR info for this object - */ - hwloc_userdata = (rmaps_lama_hwloc_user_t*)((opal_hwloc_topo_data_t*)(*child_obj)->userdata)->userdata; - mppr_accounting = (rmaps_lama_node_mppr_t*)opal_pointer_array_get_item(hwloc_userdata->node_mppr, node->index); - - hwloc_obj_snprintf(str, sizeof(str), node->topology, *child_obj, "#", 0); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: %s: P [%2d] %10s - %20s - Max %3d , Cur %3d (Oversub.: %s / %s)", - (check_only ? "Checking " : "Increment"), - node->index, node->name, str, - mppr_accounting->max, - (check_only ? mppr_accounting->cur : mppr_accounting->cur + 1), - (rmaps_lama_am_oversubscribing ? "T" : "F"), - (rmaps_lama_can_oversubscribe ? "T" : "F") ); - - /* - * Check limits - Error on first to exceed - */ - if( check_only ) { - if( mppr_accounting->max >= 0 && !rmaps_lama_am_oversubscribing) { - if( (mppr_accounting->cur)+1 > mppr_accounting->max ) { - return ORTE_ERROR; - } - } - } - /* - * Increment current number allocated below this level - */ - else { - mppr_accounting->cur += 1; - } - - /* - * Go to parent - */ - return rmaps_lama_iter_mppr_parents(node, &((*child_obj)->parent), check_only); -} - -static int rmaps_lama_iter_mppr_children(orte_node_t *node, - hwloc_obj_t *child_obj, - bool check_only) -{ - int ret; - rmaps_lama_hwloc_user_t *hwloc_userdata = NULL; - rmaps_lama_node_mppr_t *mppr_accounting = NULL; - char str[128]; - int i; - - /* - * Check self - */ - /* - * Access MPPR info for this object - */ - hwloc_userdata = (rmaps_lama_hwloc_user_t*)((opal_hwloc_topo_data_t*)(*child_obj)->userdata)->userdata; - mppr_accounting = (rmaps_lama_node_mppr_t*)opal_pointer_array_get_item(hwloc_userdata->node_mppr, node->index); - - hwloc_obj_snprintf(str, sizeof(str), node->topology, *child_obj, "#", 0); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: %s: C [%2d] %10s - %20s - Max %3d , Cur %3d (Oversub.: %s / %s)", - (check_only ? "Checking " : "Increment"), - node->index, node->name, str, - mppr_accounting->max, - (check_only ? mppr_accounting->cur : mppr_accounting->cur + 1), - (rmaps_lama_am_oversubscribing ? "T" : "F"), - (rmaps_lama_can_oversubscribe ? "T" : "F") ); - - /* - * Check limits - Error on first to exceed - */ - if( check_only ) { - if( mppr_accounting->max >= 0 && !rmaps_lama_am_oversubscribing) { - if( (mppr_accounting->cur)+1 > mppr_accounting->max ) { - return ORTE_ERROR; - } - } - } - /* - * Increment current number allocated below this level - */ - else { - mppr_accounting->cur += 1; - } - - /* - * Check all children - */ - for(i = 0; i < (int)(*child_obj)->arity; ++i ) { - if( ORTE_SUCCESS != (ret = rmaps_lama_iter_mppr_children(node, &((*child_obj)->children[i]), check_only)) ) { - return ret; - } - } - - return ORTE_SUCCESS; -} - - -static char * get_native_slot_list(orte_node_t *cur_node, hwloc_obj_t *pu_obj, int *put_idx_ref) -{ - int i; - char *slot_list = NULL; - hwloc_obj_t *binding_parent = NULL; - hwloc_obj_t *cur_parent = NULL; - hwloc_cpuset_t binding_cpuset; - hwloc_cpuset_t scratch_cpuset; - char *type_str = NULL; - - /* - * Sanity check - */ - if( NULL == pu_obj ) { - return NULL; - } - - /* - * Determine the cpumask to send to the backend for binding - */ - - /* - * Iterate up the tree until we reach the binding parent - */ - binding_parent = rmaps_lama_find_parent(cur_node->topology, pu_obj, lama_binding_level); - if( NULL == binding_parent ) { - return NULL; - } - - /* - * Iterate across cousins until we find enough resources or hit the node boundary - */ - binding_cpuset = hwloc_bitmap_alloc(); - hwloc_bitmap_zero(binding_cpuset); - - scratch_cpuset = hwloc_bitmap_alloc(); - - cur_parent = binding_parent; - - for(i = 0; i < lama_binding_num_levels; ++i) { - /* - * Check MPPR Availability - */ - if( ORTE_SUCCESS != rmaps_lama_check_mppr(cur_node, cur_parent) ) { - goto cleanup; - } - - /* - * Accumulate the bitmask - * - * JJH: TODO: Add resource offline check (?) - */ - hwloc_bitmap_zero(scratch_cpuset); - /* JJH: Maybe use opal_hwloc_base_get_available_cpus(cur_node->topology, (*cur_parent)) ? - * They do pretty much the same thing, but with more checks... - */ - hwloc_bitmap_and(scratch_cpuset, (*cur_parent)->allowed_cpuset, (*cur_parent)->online_cpuset); - hwloc_bitmap_or(binding_cpuset, scratch_cpuset, binding_cpuset); - -#if 0 - { - hwloc_obj_snprintf(str, sizeof(str), cur_node->topology, *cur_parent, "#", 0); - printf("--> BINDING TO -- %-20s \t -- %2d of %2d -- %2d vs %2d\n",str, - i, lama_binding_level, - (*binding_parent)->logical_index, (*cur_parent)->logical_index); - - hwloc_bitmap_snprintf(str, sizeof(str), (*cur_parent)->allowed_cpuset ); - printf("--> CPU A : %-20s\n", str); - hwloc_bitmap_snprintf(str, sizeof(str), (*cur_parent)->online_cpuset ); - printf("--> CPU B : %-20s\n", str); - hwloc_bitmap_snprintf(str, sizeof(str), scratch_cpuset); - printf("--> CPU C : %-20s\n", str); - hwloc_bitmap_snprintf(str, sizeof(str), binding_cpuset); - printf("--> CPU D : %-20s\n", str); - } -#endif - - /* - * Iterate to the next cousin. - * If we exceed the boundary of the node, then send up an error. - */ - if( (i+1) < lama_binding_num_levels && NULL == (*cur_parent)->next_cousin ) { - type_str = lama_type_enum_to_str(lama_binding_level); - opal_output_verbose(10, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Error: Not able to bind to %*d x %10s - Stopped at %*d", - MAX_BIND_DIGIT_LEN, lama_binding_num_levels, - type_str, - MAX_BIND_DIGIT_LEN, i); - free(type_str); - type_str = NULL; - goto cleanup; - } - /* - * Point to the next cousin - */ - if( NULL != (*cur_parent)->next_cousin ) { - cur_parent = &((*cur_parent)->next_cousin); - } - } - - /* - * Account for the process placement in the MPPR - * Assumes a previous check - * We cannot do this in the loop, since if the MPPR check fails we would - * need to roll back previous increments. - */ - cur_parent = binding_parent; - for(i = 0; i < lama_binding_num_levels; ++i) { - /* - * Account for the process placement in the MPPR - * Assumes a previous check. - */ - if( ORTE_SUCCESS != rmaps_lama_inc_mppr(cur_node, cur_parent) ) { - goto cleanup; - } - - /* - * Point to the next cousin - */ - if( NULL != (*cur_parent)->next_cousin ) { - cur_parent = &((*cur_parent)->next_cousin); - } - } - - /* - * Convert the cpuset to a slot_list for the remote daemon - */ - hwloc_bitmap_list_asprintf(&slot_list, binding_cpuset); - - cleanup: - hwloc_bitmap_free(scratch_cpuset); - hwloc_bitmap_free(binding_cpuset); - free(binding_parent); - - return slot_list; -} - - -/********************************* - * Timer Support - *********************************/ -static double rmaps_lama_get_time(void) -{ - double wtime; - -#if OPAL_TIMER_USEC_NATIVE - wtime = (double)opal_timer_base_get_usec() / 1000000.0; -#else - struct timeval tv; - gettimeofday(&tv, NULL); - wtime = tv.tv_sec; - wtime += (double)tv.tv_usec / 1000000.0; -#endif - - return wtime; -} - -static void rmaps_lama_set_time(int idx, bool is_start) -{ - if(idx < RMAPS_LAMA_TIMER_MAX ) { - if( is_start ) { - timer_start[idx] = rmaps_lama_get_time(); - } else { - timer_end[idx] = rmaps_lama_get_time(); - timer_accum[idx] += timer_end[idx] - timer_start[idx]; - } - } -} - -static void rmaps_lama_display_all_timers(void) -{ - double diff = 0.0; - double total = 0.0; - char * label = NULL; - - opal_output(0, - "mca:rmaps:lama: Timing: ---------------------------\n"); - - /* - * Timer: Parse Parameters - */ - label = strdup("Parse Params"); - diff = timer_accum[RMAPS_LAMA_TIMER_PARSE_PARAMS]; - rmaps_lama_display_indv_timer_core(diff, label); - free(label); - total += diff; - - /* - * Timer: Build Max Tree - */ - label = strdup("Build Max Tree"); - diff = timer_accum[RMAPS_LAMA_TIMER_BUILD_MAX_TREE]; - rmaps_lama_display_indv_timer_core(diff, label); - free(label); - total += diff; - - /* - * Timer: Mapping - */ - label = strdup("Mapping"); - diff = timer_accum[RMAPS_LAMA_TIMER_MAPPING]; - rmaps_lama_display_indv_timer_core(diff, label); - free(label); - total += diff; - - /* - * Timer: Ordering - */ - label = strdup("Ordering"); - diff = timer_accum[RMAPS_LAMA_TIMER_ORDERING]; - rmaps_lama_display_indv_timer_core(diff, label); - free(label); - total += diff; - - /* - * Timer: Total Overhead - */ - label = strdup("Other Overhead"); - diff = timer_accum[RMAPS_LAMA_TIMER_TOTAL]; - rmaps_lama_display_indv_timer_core(diff - total, label); - free(label); - - /* - * Timer: Total - */ - label = strdup("Total"); - diff = timer_accum[RMAPS_LAMA_TIMER_TOTAL]; - rmaps_lama_display_indv_timer_core(diff, label); - free(label); - - opal_output(0, - "mca:rmaps:lama: ---------------------------------"); -} - -static void rmaps_lama_clear_timers(void) -{ - int i; - for(i = 0; i < RMAPS_LAMA_TIMER_MAX; ++i) { - timer_start[i] = 0.0; - timer_end[i] = 0.0; - timer_accum[i] = 0.0; - } -} - - -static void rmaps_lama_display_indv_timer_core(double diff, char *str) -{ - double perc = 0; - double total = 0; - - total = timer_end[RMAPS_LAMA_TIMER_TOTAL] - timer_start[RMAPS_LAMA_TIMER_TOTAL]; - perc = (diff/total) * 100; - - opal_output(0, - "mca:rmaps:lama: \t%-20s = %10.2f ms\t%6.2f %s\n", - str, (diff * 1000), perc, "%"); - return; -} diff --git a/orte/mca/rmaps/lama/rmaps_lama_params.c b/orte/mca/rmaps/lama/rmaps_lama_params.c deleted file mode 100644 index 6a54b4ba340..00000000000 --- a/orte/mca/rmaps/lama/rmaps_lama_params.c +++ /dev/null @@ -1,878 +0,0 @@ -/* - * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2013 Cisco Systems, Inc. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** - * Processing for command line interface options - * - */ -#include "rmaps_lama.h" - -#include "opal/util/argv.h" - -#include "orte/mca/rmaps/base/rmaps_private.h" -#include "orte/mca/rmaps/base/base.h" -#include "orte/util/show_help.h" - -#include - -/********************************* - * Local Functions - *********************************/ -/* - * QSort: Integer comparison - */ -static int lama_parse_int_sort(const void *a, const void *b); - -/* - * Convert the '-ppr' syntax from the 'ppr' component to the 'lama' '-mppr' syntax. - */ -static char * rmaps_lama_covert_ppr(char * given_ppr); - -/********************************* - * Parsing Functions - *********************************/ -int rmaps_lama_process_alias_params(orte_job_t *jdata) -{ - int exit_status = ORTE_SUCCESS; - - /* - * Mapping options - * Note: L1, L2, L3 are not exposed in orterun to the user, so - * there is no need to specify them here. - */ - if( NULL == rmaps_lama_cmd_map ) { - /* orte_rmaps_base.mapping */ - switch( ORTE_GET_MAPPING_POLICY(jdata->map->mapping) ) { - case ORTE_MAPPING_BYNODE: - /* rmaps_lama_cmd_map = strdup("nbNsL3L2L1ch"); */ - rmaps_lama_cmd_map = strdup("nbsch"); - break; - case ORTE_MAPPING_BYBOARD: - /* rmaps_lama_cmd_map = strdup("bnNsL3L2L1ch"); */ - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mapping option", - true, - "by board", "mapping by board not supported by LAMA"); - exit_status = ORTE_ERR_NOT_SUPPORTED; - goto cleanup; - break; - case ORTE_MAPPING_BYNUMA: - /* rmaps_lama_cmd_map = strdup("NbnsL3L2L1ch"); */ - rmaps_lama_cmd_map = strdup("Nbnsch"); - break; - case ORTE_MAPPING_BYSOCKET: - /* rmaps_lama_cmd_map = strdup("sNbnL3L2L1ch"); */ - rmaps_lama_cmd_map = strdup("sbnch"); - break; - case ORTE_MAPPING_BYL3CACHE: - rmaps_lama_cmd_map = strdup("L3sNbnL2L1ch"); - break; - case ORTE_MAPPING_BYL2CACHE: - rmaps_lama_cmd_map = strdup("L2sNbnL1ch"); - break; - case ORTE_MAPPING_BYL1CACHE: - rmaps_lama_cmd_map = strdup("L1sNbnch"); - break; - case ORTE_MAPPING_BYCORE: - case ORTE_MAPPING_BYSLOT: - /* rmaps_lama_cmd_map = strdup("cL1L2L3sNbnh"); */ - rmaps_lama_cmd_map = strdup("csbnh"); - break; - case ORTE_MAPPING_BYHWTHREAD: - /* rmaps_lama_cmd_map = strdup("hcL1L2L3sNbn"); */ - rmaps_lama_cmd_map = strdup("hcsbn"); - break; - case ORTE_MAPPING_RR: - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mapping option", - true, - "round robin", "mapping by round robin not supported by LAMA"); - exit_status = ORTE_ERR_NOT_SUPPORTED; - goto cleanup; - case ORTE_MAPPING_SEQ: - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mapping option", - true, - "sequential", "mapping by sequential not supported by LAMA"); - exit_status = ORTE_ERR_NOT_SUPPORTED; - goto cleanup; - case ORTE_MAPPING_BYUSER: - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mapping option", - true, - "by user", "mapping by user not supported by LAMA"); - exit_status = ORTE_ERR_NOT_SUPPORTED; - goto cleanup; - default: - /* - * Default is map-by core - */ - rmaps_lama_cmd_map = strdup("cL1L2L3sNbnh"); - break; - } - } - - /* - * Binding Options - */ - if( NULL == rmaps_lama_cmd_bind ) { - /* - * No binding specified, use default - */ - if( !OPAL_BINDING_POLICY_IS_SET(jdata->map->binding) || - !OPAL_BINDING_REQUIRED(opal_hwloc_binding_policy) || - OPAL_BIND_TO_NONE == OPAL_GET_BINDING_POLICY(jdata->map->binding) ) { - rmaps_lama_cmd_bind = NULL; - } - - switch( OPAL_GET_BINDING_POLICY(jdata->map->binding) ) { - case OPAL_BIND_TO_BOARD: - /* rmaps_lama_cmd_bind = strdup("1b"); */ - orte_show_help("help-orte-rmaps-lama.txt", - "invalid binding option", - true, - "by board", "binding to board not supported by LAMA"); - exit_status = ORTE_ERR_NOT_SUPPORTED; - goto cleanup; - break; - case OPAL_BIND_TO_NUMA: - rmaps_lama_cmd_bind = strdup("1N"); - break; - case OPAL_BIND_TO_SOCKET: - rmaps_lama_cmd_bind = strdup("1s"); - break; - case OPAL_BIND_TO_L3CACHE: - rmaps_lama_cmd_bind = strdup("1L3"); - break; - case OPAL_BIND_TO_L2CACHE: - rmaps_lama_cmd_bind = strdup("1L2"); - break; - case OPAL_BIND_TO_L1CACHE: - rmaps_lama_cmd_bind = strdup("1L1"); - break; - case OPAL_BIND_TO_CORE: - rmaps_lama_cmd_bind = strdup("1c"); - break; - case OPAL_BIND_TO_HWTHREAD: - rmaps_lama_cmd_bind = strdup("1h"); - break; - case OPAL_BIND_TO_CPUSET: - orte_show_help("help-orte-rmaps-lama.txt", - "invalid binding option", - true, - "by CPU set", "binding to CPU set not supported by LAMA"); - exit_status = ORTE_ERR_NOT_SUPPORTED; - goto cleanup; - break; - default: - rmaps_lama_cmd_bind = NULL; - break; - } - } - - /* - * Ordering (a.k.a. Ranking) Options - */ - if( NULL == rmaps_lama_cmd_ordering ) { - /* orte_rmaps_base.ranking */ - switch( ORTE_GET_RANKING_POLICY(jdata->map->ranking) ) { - case ORTE_RANK_BY_SLOT: - rmaps_lama_cmd_ordering = strdup("s"); - break; - case ORTE_RANK_BY_NODE: - case ORTE_RANK_BY_NUMA: - case ORTE_RANK_BY_SOCKET: - case ORTE_RANK_BY_L3CACHE: - case ORTE_RANK_BY_L2CACHE: - case ORTE_RANK_BY_L1CACHE: - case ORTE_RANK_BY_CORE: - case ORTE_RANK_BY_HWTHREAD: - rmaps_lama_cmd_ordering = strdup("n"); - break; - case ORTE_RANK_BY_BOARD: - /* rmaps_lama_cmd_ordering = strdup("n"); */ - orte_show_help("help-orte-rmaps-lama.txt", - "invalid ordering option", - true, - "by board", "ordering by board not supported by LAMA"); - exit_status = ORTE_ERR_NOT_SUPPORTED; - goto cleanup; - break; - default: - rmaps_lama_cmd_ordering = strdup("n"); - break; - } - } - - /* - * MPPR - */ - if( NULL == rmaps_lama_cmd_mppr ) { - /* - * The ppr is given in the map - */ - if( NULL != jdata->map->ppr) { - rmaps_lama_cmd_mppr = rmaps_lama_covert_ppr(jdata->map->ppr); - } - } - - /* - * Oversubscription - */ - if( ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping) ) { - rmaps_lama_can_oversubscribe = false; - } - else { - rmaps_lama_can_oversubscribe = true; - } - - /* - * Display revised values - */ - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Revised Parameters -----"); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Map : %s", - rmaps_lama_cmd_map); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Bind : %s", - rmaps_lama_cmd_bind); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: MPPR : %s", - rmaps_lama_cmd_mppr); - opal_output_verbose(5, orte_rmaps_base_framework.framework_output, - "mca:rmaps:lama: Order : %s", - rmaps_lama_cmd_ordering); - - cleanup: - return exit_status; -} - -static char * rmaps_lama_covert_ppr(char * given_ppr) -{ - return strdup(given_ppr); -} - -int rmaps_lama_parse_mapping(char *layout, - rmaps_lama_level_type_t **layout_types, - rmaps_lama_level_type_t **layout_types_sorted, - int *num_types) -{ - int exit_status = ORTE_SUCCESS; - char param[3]; - int i, j, len; - bool found_req_param_n = false; - bool found_req_param_h = false; - bool found_req_param_bind = false; - - /* - * Sanity Check: - * There is no default layout, so if we get here and nothing is specified - * then this is an error. - */ - if( NULL == layout ) { - orte_show_help("help-orte-rmaps-lama.txt", - "internal error", - true, - "rmaps_lama_parse_mapping", - "internal error 1"); - return ORTE_ERROR; - } - - *num_types = 0; - - /* - * Extract and convert all the keys - */ - len = strlen(layout); - for(i = 0; i < len; ++i) { - /* - * L1 : L1 Cache - * L2 : L2 Cache - * L3 : L3 Cache - */ - if( layout[i] == 'L' ) { - param[0] = layout[i]; - ++i; - /* - * Check for 2 characters - */ - if( i >= len ) { - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mapping option", - true, - layout, "cache level missing number"); - exit_status = ORTE_ERROR; - goto cleanup; - } - param[1] = layout[i]; - param[2] = '\0'; - } - /* - * n : Machine - * b : Board - * s : Socket - * c : Core - * h : Hardware Thread - * N : NUMA Node - */ - else { - param[0] = layout[i]; - param[1] = '\0'; - } - - /* - * Append level - */ - *num_types += 1; - *layout_types = (rmaps_lama_level_type_t*)realloc(*layout_types, sizeof(rmaps_lama_level_type_t) * (*num_types)); - (*layout_types)[(*num_types)-1] = lama_type_str_to_enum(param); - } - - /* - * Check for duplicates and unknowns - * Copy to sorted list - */ - *layout_types_sorted = (rmaps_lama_level_type_t*)malloc(sizeof(rmaps_lama_level_type_t) * (*num_types)); - for( i = 0; i < *num_types; ++i ) { - /* - * Copy for later sorting - */ - (*layout_types_sorted)[i] = (*layout_types)[i]; - - /* - * Look for unknown and unsupported options - */ - if( LAMA_LEVEL_UNKNOWN <= (*layout_types)[i] ) { - char *msg; - asprintf(&msg, "unknown mapping level at position %d", i + 1); - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mapping option", - true, - layout, msg); - free(msg); - exit_status = ORTE_ERROR; - goto cleanup; - } - - if( LAMA_LEVEL_MACHINE == (*layout_types)[i] ) { - found_req_param_n = true; - } - - if( LAMA_LEVEL_PU == (*layout_types)[i] ) { - found_req_param_h = true; - } - - if( lama_binding_level == (*layout_types)[i] ) { - found_req_param_bind = true; - } - - /* - * Look for duplicates - */ - for( j = i+1; j < *num_types; ++j ) { - if( (*layout_types)[i] == (*layout_types)[j] ) { - char *msg; - asprintf(&msg, "duplicate mapping levels at position %d and %d", - i + 1, j + 1); - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mapping option", - true, - layout, msg); - free(msg); - exit_status = ORTE_ERROR; - goto cleanup; - } - } - } - - /* - * The user is required to specify at least the: - * - machine - * - hardware thread (needed for lower bound binding) JJH: We should be able to lift this... - * - binding layer (need it to stride the mapping) - * Only print the error message once, for brevity. - */ - if( !found_req_param_n ) { - char *msg; - asprintf(&msg, "missing required 'n' mapping token"); - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mapping option", - true, - layout, msg); - free(msg); - exit_status = ORTE_ERROR; - goto cleanup; - } - else if(!found_req_param_h) { - char *msg; - asprintf(&msg, "missing required 'h' mapping token"); - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mapping option", - true, - layout, msg); - free(msg); - exit_status = ORTE_ERROR; - goto cleanup; - } else if (!found_req_param_bind) { - char *msg; - asprintf(&msg, "missing required mapping token for the current binding level"); - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mapping option", - true, - layout, msg); - free(msg); - exit_status = ORTE_ERROR; - goto cleanup; - } - - /* - * Sort the items - */ - qsort((*layout_types_sorted ), (*num_types), sizeof(int), lama_parse_int_sort); - - cleanup: - return exit_status; -} - -int rmaps_lama_parse_binding(char *layout, rmaps_lama_level_type_t *binding_level, int *num_types) -{ - int exit_status = ORTE_SUCCESS; - char param[3]; - char num[MAX_BIND_DIGIT_LEN]; - int i, n, p, len; - - /* - * Default: If nothing specified - * - Bind to machine - */ - if( NULL == layout ) { - *binding_level = LAMA_LEVEL_MACHINE; - *num_types = 1; - return ORTE_SUCCESS; - } - - *num_types = 0; - - /* - * Extract and convert all the keys - */ - len = strlen(layout); - n = 0; - p = 0; - for(i = 0; i < len; ++i) { - /* - * Must start with a digit - */ - if( isdigit(layout[i]) ) { - /* - * Check: Digits must come first - */ - if( p != 0 ) { - orte_show_help("help-orte-rmaps-lama.txt", - "invalid binding option", - true, - layout, "missing digit(s) before binding level token"); - exit_status = ORTE_ERROR; - goto cleanup; - } - - num[n] = layout[i]; - ++n; - /* - * Check: Exceed bound of number of digits - */ - if( n >= MAX_BIND_DIGIT_LEN ) { - orte_show_help("help-orte-rmaps-lama.txt", - "invalid binding option", - true, - layout, "too many digits"); - exit_status = ORTE_ERROR; - goto cleanup; - } - } - /* - * Extract the level - */ - else { - /* - * Check: Digits must come first - */ - if( n == 0 ) { - orte_show_help("help-orte-rmaps-lama.txt", - "invalid binding option", - true, - layout, "missing digit(s) before binding level token"); - exit_status = ORTE_ERROR; - goto cleanup; - } - /* - * Check: Only one level allowed - */ - if( p != 0 ) { - orte_show_help("help-orte-rmaps-lama.txt", - "invalid binding option", - true, - layout, "only one binding level may be specified"); - exit_status = ORTE_ERROR; - goto cleanup; - } - - /* - * L1 : L1 Cache - * L2 : L2 Cache - * L3 : L3 Cache - */ - if( layout[i] == 'L' ) { - param[0] = layout[i]; - ++i; - /* - * Check for 2 characters - */ - if( i >= len ) { - orte_show_help("help-orte-rmaps-lama.txt", - "invalid binding option", - true, - layout, "only one binding level may be specified"); - exit_status = ORTE_ERROR; - goto cleanup; - } - param[1] = layout[i]; - p = 2; - } - /* - * n : Machine - * b : Board - * s : Socket - * c : Core - * h : Hardware Thread - * N : NUMA Node - */ - else { - param[0] = layout[i]; - p = 1; - } - param[p] = '\0'; - } - } - /* - * Check that the level was specified - */ - if( p == 0 ) { - orte_show_help("help-orte-rmaps-lama.txt", - "invalid binding option", - true, - layout, "binding specification is empty"); - exit_status = ORTE_ERROR; - goto cleanup; - } - num[n] = '\0'; - - *binding_level = lama_type_str_to_enum(param); - *num_types = atoi(num); - - /* - * Check for unknown level - */ - if( LAMA_LEVEL_UNKNOWN <= *binding_level ) { - orte_show_help("help-orte-rmaps-lama.txt", - "invalid binding option", - true, - layout, "unknown binding level"); - exit_status = ORTE_ERROR; - goto cleanup; - } - - cleanup: - return exit_status; -} - -int rmaps_lama_parse_mppr(char *layout, rmaps_lama_level_info_t **mppr_levels, int *num_types) -{ - int exit_status = ORTE_SUCCESS; - char param[3]; - char num[MAX_BIND_DIGIT_LEN]; - char **argv = NULL; - int argc = 0; - int i, j, len; - int p, n; - - /* - * Default: Unrestricted allocation - * 'oversubscribe' flag accounted for elsewhere - */ - if( NULL == layout ) { - *mppr_levels = NULL; - *num_types = 0; - return ORTE_SUCCESS; - } - - *num_types = 0; - - /* - * Split by ',' - * <#:level>,<#:level>,... - */ - argv = opal_argv_split(layout, ','); - argc = opal_argv_count(argv); - for(j = 0; j < argc; ++j) { - /* - * Parse <#:level> - */ - len = strlen(argv[j]); - n = 0; - p = 0; - for(i = 0; i < len; ++i) { - /* - * Skip the ':' separator and whitespace - */ - if( argv[j][i] == ':' || isblank(argv[j][i])) { - continue; - } - /* - * Must start with a digit - */ - else if( isdigit(argv[j][i]) ) { - /* - * Check: Digits must come first - */ - if( p != 0 ) { - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mppr option", - true, - layout, "missing digit(s) before resource specification"); - exit_status = ORTE_ERROR; - goto cleanup; - } - - num[n] = argv[j][i]; - ++n; - /* - * Check: Exceed bound of number of digits - */ - if( n >= MAX_BIND_DIGIT_LEN ) { - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mppr option", - true, - layout, "too many digits"); - exit_status = ORTE_ERROR; - goto cleanup; - } - } - /* - * Extract the level - */ - else { - /* - * Check: Digits must come first - */ - if( n == 0 ) { - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mppr option", - true, - layout, "missing digit(s) before resource specification"); - exit_status = ORTE_ERROR; - goto cleanup; - } - /* - * Check: Only one level allowed - */ - if( p != 0 ) { - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mppr option", - true, - layout, "only one resource type may be listed per specification"); - exit_status = ORTE_ERROR; - goto cleanup; - } - - /* - * L1 : L1 Cache - * L2 : L2 Cache - * L3 : L3 Cache - */ - if( argv[j][i] == 'L' ) { - param[0] = argv[j][i]; - ++i; - /* - * Check for 2 characters - */ - if( i >= len ) { - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mppr option", - true, - layout, "cache level missing number"); - exit_status = ORTE_ERROR; - goto cleanup; - } - param[1] = argv[j][i]; - p = 2; - } - /* - * n : Machine - * b : Board - * s : Socket - * c : Core - * h : Hardware Thread - * N : NUMA Node - */ - else { - param[0] = argv[j][i]; - p = 1; - } - param[p] = '\0'; - } - } - - /* - * Whitespace, just skip - */ - if( n == 0 && p == 0 ) { - continue; - } - - /* - * Check that the level was specified - */ - if( p == 0 ) { - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mppr option", - true, - layout, "resource type not specified"); - exit_status = ORTE_ERROR; - goto cleanup; - } - num[n] = '\0'; - - /* - * Append level - */ - *num_types += 1; - *mppr_levels = (rmaps_lama_level_info_t*)realloc(*mppr_levels, sizeof(rmaps_lama_level_info_t) * (*num_types)); - (*mppr_levels)[(*num_types)-1].type = lama_type_str_to_enum(param); - (*mppr_levels)[(*num_types)-1].max_resources = atoi(num); - - } - - /* - * Check for duplicates and unknowns - */ - for( i = 0; i < *num_types; ++i ) { - /* - * Look for unknown and unsupported options - */ - if( LAMA_LEVEL_UNKNOWN <= (*mppr_levels)[i].type ) { - char *msg; - asprintf(&msg, "unknown resource type at position %d", i + 1); - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mppr option", - true, - layout, msg); - free(msg); - exit_status = ORTE_ERROR; - goto cleanup; - } - - /* - * Look for duplicates - */ - for( j = i+1; j < *num_types; ++j ) { - if( (*mppr_levels)[i].type == (*mppr_levels)[j].type ) { - char *msg; - asprintf(&msg, "duplicate resource tpyes at position %d and %d", - i + 1, j + 1); - orte_show_help("help-orte-rmaps-lama.txt", - "invalid mppr option", - true, - layout, msg); - free(msg); - exit_status = ORTE_ERROR; - goto cleanup; - } - } - } - - cleanup: - if( NULL != argv ) { - opal_argv_free(argv); - argv = NULL; - } - - return exit_status; -} - -int rmaps_lama_parse_ordering(char *layout, - rmaps_lama_order_type_t *order) -{ - /* - * Default: Natural ordering - */ - if( NULL == layout ) { - *order = LAMA_ORDER_NATURAL; - return ORTE_SUCCESS; - } - - /* - * Sequential Ordering - */ - if( 0 == strncmp(layout, "s", strlen("s")) || - 0 == strncmp(layout, "S", strlen("S")) ) { - *order = LAMA_ORDER_SEQ; - } - /* - * Natural Ordering - */ - else if( 0 == strncmp(layout, "n", strlen("n")) || - 0 == strncmp(layout, "N", strlen("N")) ) { - *order = LAMA_ORDER_NATURAL; - } - /* - * Check for unknown options - */ - else { - orte_show_help("help-orte-rmaps-lama.txt", - "invalid ordering option", - true, - "unsupported ordering option", layout); - return ORTE_ERROR; - } - - return ORTE_SUCCESS; -} - -bool rmaps_lama_ok_to_prune_level(rmaps_lama_level_type_t level) -{ - int i; - - for( i = 0; i < lama_mapping_num_layouts; ++i ) { - if( level == lama_mapping_layout[i] ) { - return false; - } - } - - return true; -} - -/********************************* - * Support Functions - *********************************/ -static int lama_parse_int_sort(const void *a, const void *b) { - int left = *((int*)a); - int right = *((int*)b); - - if( left < right ) { - return -1; - } - else if( left > right ) { - return 1; - } - else { - return 0; - } -} diff --git a/orte/mca/rmaps/mindist/rmaps_mindist_module.c b/orte/mca/rmaps/mindist/rmaps_mindist_module.c index 53ce91f71ae..29d5e7813b5 100644 --- a/orte/mca/rmaps/mindist/rmaps_mindist_module.c +++ b/orte/mca/rmaps/mindist/rmaps_mindist_module.c @@ -45,7 +45,7 @@ static int mindist_map(orte_job_t *jdata); orte_rmaps_base_module_t orte_rmaps_mindist_module = { - mindist_map + .map_job = mindist_map }; /* @@ -391,15 +391,6 @@ static int mindist_map(orte_job_t *jdata) } } - /* compute vpids and add proc objects to the job - do this after - * each app_context so that the ranks within each context are - * contiguous - */ - if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata, app, &node_list))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* track the total number of processes we mapped - must update * this value AFTER we compute vpids so that computation * is done correctly @@ -415,6 +406,17 @@ static int mindist_map(orte_job_t *jdata) OBJ_DESTRUCT(&node_list); } free(orte_rmaps_base.device); + /* compute vpids and add proc objects to the job - do this after + * each app_context so that the ranks within each context are + * contiguous + */ + if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + /* mark the job as fully described */ + orte_set_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL); return ORTE_SUCCESS; error: @@ -425,3 +427,96 @@ static int mindist_map(orte_job_t *jdata) return rc; } + +#if 0 +static int assign_locations(orte_job_t *jdata) +{ + int j, k, m, n, npus; + orte_app_context_t *app; + orte_node_t *node; + orte_proc_t *proc; + hwloc_obj_t obj=NULL; + mca_base_component_t *c = &mca_rmaps_mindist_component.base_version; + int rc; + opal_list_t numa_list; + opal_rmaps_numa_node_t *numa; + + if (NULL == jdata->map->last_mapper|| + 0 != strcasecmp(jdata->map->last_mapper, c->mca_component_name)) { + /* the mapper should have been set to me */ + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:mindist: job %s not using mindist mapper", + ORTE_JOBID_PRINT(jdata->jobid)); + return ORTE_ERR_TAKE_NEXT_OPTION; + } + + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:mindist: assign locations for job %s", + ORTE_JOBID_PRINT(jdata->jobid)); + + /* start assigning procs to objects, filling each object as we go until + * all procs are assigned. If one pass doesn't catch all the required procs, + * then loop thru the list again to handle the oversubscription + */ + for (n=0; n < jdata->apps->size; n++) { + if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, n))) { + continue; + } + for (m=0; m < jdata->map->nodes->size; m++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, m))) { + continue; + } + if (NULL == node->topology || NULL == node->topology->topo) { + orte_show_help("help-orte-rmaps-ppr.txt", "ppr-topo-missing", + true, node->name); + return ORTE_ERR_SILENT; + } + + /* first we need to fill summary object for root with information about nodes + * so we call opal_hwloc_base_get_nbobjs_by_type */ + opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, HWLOC_OBJ_NODE, 0, OPAL_HWLOC_AVAILABLE); + OBJ_CONSTRUCT(&numa_list, opal_list_t); + rc = opal_hwloc_get_sorted_numa_list(node->topology->topo, orte_rmaps_base.device, &numa_list); + if (rc > 1) { + orte_show_help("help-orte-rmaps-md.txt", "orte-rmaps-mindist:several-devices", + true, orte_rmaps_base.device, rc, node->name); + ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT); + OPAL_LIST_DESTRUCT(&numa_list); + return ORTE_ERR_TAKE_NEXT_OPTION; + } else if (rc < 0) { + orte_show_help("help-orte-rmaps-md.txt", "orte-rmaps-mindist:device-not-found", + true, orte_rmaps_base.device, node->name); + ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT); + OPAL_LIST_DESTRUCT(&numa_list); + return ORTE_ERR_TAKE_NEXT_OPTION; + } + j = 0; + OPAL_LIST_FOREACH(numa, &numa_list, opal_rmaps_numa_node_t) { + /* get the hwloc object for this numa */ + if (NULL == (obj = opal_hwloc_base_get_obj_by_type(node->topology->topo, HWLOC_OBJ_NODE, 0, numa->index, OPAL_HWLOC_AVAILABLE))) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + OPAL_LIST_DESTRUCT(&numa_list); + return ORTE_ERR_NOT_FOUND; + } + npus = opal_hwloc_base_get_npus(node->topology->topo, obj); + /* fill the numa region with procs from this job until we either + * have assigned everyone or the region is full */ + for (k = j; k < node->procs->size && 0 < npus; k++) { + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, k))) { + continue; + } + if (proc->name.jobid != jdata->jobid) { + continue; + } + orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR); + ++j; + --npus; + } + } + OPAL_LIST_DESTRUCT(&numa_list); + } + } + + return ORTE_SUCCESS; +} +#endif diff --git a/orte/mca/rmaps/ppr/rmaps_ppr.c b/orte/mca/rmaps/ppr/rmaps_ppr.c index 35285e95cda..41523de3b6b 100644 --- a/orte/mca/rmaps/ppr/rmaps_ppr.c +++ b/orte/mca/rmaps/ppr/rmaps_ppr.c @@ -33,9 +33,11 @@ #include "rmaps_ppr.h" static int ppr_mapper(orte_job_t *jdata); +static int assign_locations(orte_job_t *jdata); orte_rmaps_base_module_t orte_rmaps_ppr_module = { - ppr_mapper + .map_job = ppr_mapper, + .assign_locations = assign_locations }; /* RHC: will eventually remove this @@ -391,11 +393,6 @@ static int ppr_mapper(orte_job_t *jdata) rc = ORTE_ERR_SILENT; goto error; } - /* compute vpids and add proc objects to the job */ - if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata, app, &node_list))) { - ORTE_ERROR_LOG(rc); - goto error; - } /* track the total number of processes we mapped - must update * this AFTER we compute vpids so that computation is done @@ -623,3 +620,122 @@ static void prune(orte_jobid_t jobid, error: opal_output(0, "INFINITE LOOP"); } + +static int assign_locations(orte_job_t *jdata) +{ + int i, j, m, n; + mca_base_component_t *c=&mca_rmaps_ppr_component.base_version; + orte_node_t *node; + orte_proc_t *proc; + orte_app_context_t *app; + opal_hwloc_level_t level; + hwloc_obj_t obj; + unsigned int cache_level=0; + int ppr, cnt, nobjs, nprocs_mapped; + char **ppr_req, **ck; + + if (NULL == jdata->map->last_mapper || + 0 != strcasecmp(jdata->map->last_mapper, c->mca_component_name)) { + /* a mapper has been specified, and it isn't me */ + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:ppr: job %s not using ppr assign: %s", + ORTE_JOBID_PRINT(jdata->jobid), + (NULL == jdata->map->last_mapper) ? "NULL" : jdata->map->last_mapper); + return ORTE_ERR_TAKE_NEXT_OPTION; + } + + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:ppr: assigning locations for job %s with ppr %s policy %s", + ORTE_JOBID_PRINT(jdata->jobid), jdata->map->ppr, + orte_rmaps_base_print_mapping(jdata->map->mapping)); + + /* pickup the object level */ + if (ORTE_MAPPING_BYNODE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + level = OPAL_HWLOC_NODE_LEVEL; + } else if (ORTE_MAPPING_BYHWTHREAD == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + level = OPAL_HWLOC_HWTHREAD_LEVEL; + } else if (ORTE_MAPPING_BYCORE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + level = OPAL_HWLOC_CORE_LEVEL; + } else if (ORTE_MAPPING_BYSOCKET == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + level = OPAL_HWLOC_SOCKET_LEVEL; + } else if (ORTE_MAPPING_BYL1CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + level = OPAL_HWLOC_L1CACHE_LEVEL; + cache_level = 1; + } else if (ORTE_MAPPING_BYL2CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + level = OPAL_HWLOC_L2CACHE_LEVEL; + cache_level = 2; + } else if (ORTE_MAPPING_BYL3CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + level = OPAL_HWLOC_L3CACHE_LEVEL; + cache_level = 3; + } else if (ORTE_MAPPING_BYNUMA == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + level = OPAL_HWLOC_NUMA_LEVEL; + } else { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return ORTE_ERR_TAKE_NEXT_OPTION; + } + + /* get the ppr value */ + ppr_req = opal_argv_split(jdata->map->ppr, ','); + ck = opal_argv_split(ppr_req[0], ':'); + ppr = strtol(ck[0], NULL, 10); + opal_argv_free(ck); + opal_argv_free(ppr_req); + + /* start assigning procs to objects, filling each object as we go until + * all procs are assigned. */ + for (n=0; n < jdata->apps->size; n++) { + if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, n))) { + continue; + } + nprocs_mapped = 0; + for (m=0; m < jdata->map->nodes->size; m++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, m))) { + continue; + } + if (NULL == node->topology || NULL == node->topology->topo) { + orte_show_help("help-orte-rmaps-ppr.txt", "ppr-topo-missing", + true, node->name); + return ORTE_ERR_SILENT; + } + if (OPAL_HWLOC_NODE_LEVEL == level) { + obj = hwloc_get_root_obj(node->topology->topo); + for (j=0; j < node->procs->size; j++) { + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { + continue; + } + if (proc->name.jobid != jdata->jobid) { + continue; + } + orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR); + } + } else { + /* get the number of resources on this node at this level */ + nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, + level, cache_level, + OPAL_HWLOC_AVAILABLE); + + /* map the specified number of procs to each such resource on this node, + * recording the locale of each proc so we know its cpuset + */ + cnt = 0; + for (i=0; i < nobjs; i++) { + obj = opal_hwloc_base_get_obj_by_type(node->topology->topo, + level, cache_level, + i, OPAL_HWLOC_AVAILABLE); + for (j=0; j < node->procs->size && cnt < ppr && nprocs_mapped < app->num_procs; j++) { + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { + continue; + } + if (proc->name.jobid != jdata->jobid) { + continue; + } + nprocs_mapped++; + cnt++; + orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR); + } + } + } + } + } + return ORTE_SUCCESS; +} diff --git a/orte/mca/rmaps/rank_file/rmaps_rank_file.c b/orte/mca/rmaps/rank_file/rmaps_rank_file.c index 26d19f6881e..ee8651d5b2b 100644 --- a/orte/mca/rmaps/rank_file/rmaps_rank_file.c +++ b/orte/mca/rmaps/rank_file/rmaps_rank_file.c @@ -51,6 +51,13 @@ #include "orte/mca/rmaps/rank_file/rmaps_rank_file_lex.h" #include "orte/runtime/orte_globals.h" +static int orte_rmaps_rf_map(orte_job_t *jdata); + +orte_rmaps_base_module_t orte_rmaps_rank_file_module = { + .map_job = orte_rmaps_rf_map +}; + + static int orte_rmaps_rank_file_parse(const char *); static char *orte_rmaps_rank_file_parse_string_or_int(void); static const char *orte_rmaps_rank_file_name_cur = NULL; @@ -363,6 +370,9 @@ static int orte_rmaps_rf_map(orte_job_t *jdata) } } OBJ_DESTRUCT(&rankmap); + /* mark the job as fully described */ + orte_set_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL); + return rc; error: @@ -371,11 +381,6 @@ static int orte_rmaps_rf_map(orte_job_t *jdata) return rc; } -orte_rmaps_base_module_t orte_rmaps_rank_file_module = { -orte_rmaps_rf_map -}; - - static int orte_rmaps_rank_file_parse(const char *rankfile) { int token; diff --git a/orte/mca/rmaps/resilient/rmaps_resilient.c b/orte/mca/rmaps/resilient/rmaps_resilient.c index afc4576737b..3ead4d31305 100644 --- a/orte/mca/rmaps/resilient/rmaps_resilient.c +++ b/orte/mca/rmaps/resilient/rmaps_resilient.c @@ -5,7 +5,7 @@ * Corporation. All rights reserved. * Copyright (c) 2011-2012 Los Alamos National Security, LLC. * All rights reserved. - * Copyright (c) 2014-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. * * $COPYRIGHT$ * @@ -36,6 +36,14 @@ #include "orte/mca/rmaps/base/base.h" #include "rmaps_resilient.h" +static int orte_rmaps_resilient_map(orte_job_t *jdata); +static int resilient_assign(orte_job_t *jdata); + +orte_rmaps_base_module_t orte_rmaps_resilient_module = { + .map_job = orte_rmaps_resilient_map, + .assign_locations = resilient_assign +}; + /* * Local variable @@ -270,9 +278,22 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata) return rc; } -orte_rmaps_base_module_t orte_rmaps_resilient_module = { - orte_rmaps_resilient_map -}; +static int resilient_assign(orte_job_t *jdata) +{ + mca_base_component_t *c = &mca_rmaps_resilient_component.super.base_version; + + if (NULL == jdata->map->last_mapper || + 0 != strcasecmp(jdata->map->last_mapper, c->mca_component_name)) { + /* a mapper has been specified, and it isn't me */ + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:resilient: job %s not using resilient assign: %s", + ORTE_JOBID_PRINT(jdata->jobid), + (NULL == jdata->map->last_mapper) ? "NULL" : jdata->map->last_mapper); + return ORTE_ERR_TAKE_NEXT_OPTION; + } + + return ORTE_ERR_NOT_IMPLEMENTED; +} static char *orte_getline(FILE *fp) { @@ -855,15 +876,6 @@ static int map_to_ftgrps(orte_job_t *jdata) /* track number of procs */ jdata->num_procs += app->num_procs; - /* compute vpids and add proc objects to the job - this has to be - * done after each app_context is mapped in order to keep the - * vpids contiguous within an app_context - */ - if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata, app, &node_list))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* cleanup the node list - it can differ from one app_context * to another, so we have to get it every time */ @@ -873,11 +885,5 @@ static int map_to_ftgrps(orte_job_t *jdata) OBJ_DESTRUCT(&node_list); } - /* compute and save local ranks */ - if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) { - ORTE_ERROR_LOG(rc); - return rc; - } - return ORTE_SUCCESS; } diff --git a/orte/mca/rmaps/rmaps.h b/orte/mca/rmaps/rmaps.h index 9670c7ac2c8..4faaf2e2cb8 100644 --- a/orte/mca/rmaps/rmaps.h +++ b/orte/mca/rmaps/rmaps.h @@ -60,23 +60,30 @@ BEGIN_C_DECLS * rmaps module functions */ -/* mapping event - the event one activates to schedule mapping - * of procs to nodes for pending jobs - */ -ORTE_DECLSPEC extern opal_event_t orte_mapping_event; - /** * RMAPS module functions - these are not accessible to the outside world, * but are defined here by convention */ + +/* map a job - used by the HNP to compute the #procs on each node. + * This is passed to the backend daemons as a regex which they + * use to create an orte_job_map_t for the job */ typedef int (*orte_rmaps_base_module_map_fn_t)(orte_job_t *jdata); +/* assign a location to each process. Used by the backend daemons, + * this function takes the orte_job_map_t created from the regex + * and assigns each process to a specific location within the + * hardware topology based on the --map-by directive */ +typedef int (*orte_rmaps_base_module_assign_loc_fn_t)(orte_job_t *jdata); + /* * rmaps module version 3.0.0 */ struct orte_rmaps_base_module_3_0_0_t { /** Mapping function pointer */ orte_rmaps_base_module_map_fn_t map_job; + /* assign locations */ + orte_rmaps_base_module_assign_loc_fn_t assign_locations; }; /** Convenience typedef */ typedef struct orte_rmaps_base_module_3_0_0_t orte_rmaps_base_module_3_0_0_t; diff --git a/orte/mca/rmaps/round_robin/Makefile.am b/orte/mca/rmaps/round_robin/Makefile.am index 1f19dcc7657..bd51a226429 100644 --- a/orte/mca/rmaps/round_robin/Makefile.am +++ b/orte/mca/rmaps/round_robin/Makefile.am @@ -10,6 +10,7 @@ # Copyright (c) 2004-2005 The Regents of the University of California. # All rights reserved. # Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2017 Intel, Inc. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -23,7 +24,8 @@ sources = \ rmaps_rr.c \ rmaps_rr.h \ rmaps_rr_component.c \ - rmaps_rr_mappers.c + rmaps_rr_mappers.c \ + rmaps_rr_assign.c # Make the output library in this directory, and name it either # mca__.la (for DSO builds) or libmca__.la diff --git a/orte/mca/rmaps/round_robin/rmaps_rr.c b/orte/mca/rmaps/round_robin/rmaps_rr.c index a764e0243f3..b268c4953e7 100644 --- a/orte/mca/rmaps/round_robin/rmaps_rr.c +++ b/orte/mca/rmaps/round_robin/rmaps_rr.c @@ -243,15 +243,6 @@ static int orte_rmaps_rr_map(orte_job_t *jdata) goto error; } - /* compute vpids and add proc objects to the job - do this after - * each app_context so that the ranks within each context are - * contiguous - */ - if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata, app, &node_list))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* track the total number of processes we mapped - must update * this value AFTER we compute vpids so that computation * is done correctly @@ -278,6 +269,113 @@ static int orte_rmaps_rr_map(orte_job_t *jdata) return rc; } +static int orte_rmaps_rr_assign_locations(orte_job_t *jdata) +{ + mca_base_component_t *c = &mca_rmaps_round_robin_component.base_version; + int rc; + + if (NULL == jdata->map->last_mapper || + 0 != strcasecmp(jdata->map->last_mapper, c->mca_component_name)) { + /* a mapper has been specified, and it isn't me */ + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rr: job %s not using rr mapper", + ORTE_JOBID_PRINT(jdata->jobid)); + return ORTE_ERR_TAKE_NEXT_OPTION; + } + + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rr: assign locations for job %s", + ORTE_JOBID_PRINT(jdata->jobid)); + + /* if the mapping directive was byslot or bynode, then we + * assign locations to the root object level */ + if (ORTE_MAPPING_BYNODE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping) || + ORTE_MAPPING_BYSLOT == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + return orte_rmaps_rr_assign_root_level(jdata); + } + + /* otherwise, assign by object */ + if (ORTE_MAPPING_BYHWTHREAD == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + rc = orte_rmaps_rr_assign_byobj(jdata, HWLOC_OBJ_PU, 0); + if (ORTE_ERR_NOT_FOUND == rc) { + /* if the mapper couldn't assign by this object because + * it isn't available, but the error allows us to try + * byslot, then do so + */ + ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT); + rc = orte_rmaps_rr_assign_root_level(jdata); + } + } else if (ORTE_MAPPING_BYCORE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + rc = orte_rmaps_rr_assign_byobj(jdata, HWLOC_OBJ_CORE, 0); + if (ORTE_ERR_NOT_FOUND == rc) { + /* if the mapper couldn't map by this object because + * it isn't available, but the error allows us to try + * byslot, then do so + */ + ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT); + rc = orte_rmaps_rr_assign_root_level(jdata); + } + } else if (ORTE_MAPPING_BYL1CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + rc = orte_rmaps_rr_assign_byobj(jdata, HWLOC_OBJ_CACHE, 1); + if (ORTE_ERR_NOT_FOUND == rc) { + /* if the mapper couldn't map by this object because + * it isn't available, but the error allows us to try + * byslot, then do so + */ + ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT); + rc = orte_rmaps_rr_assign_root_level(jdata); + } + } else if (ORTE_MAPPING_BYL2CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + rc = orte_rmaps_rr_assign_byobj(jdata, HWLOC_OBJ_CACHE, 2); + if (ORTE_ERR_NOT_FOUND == rc) { + /* if the mapper couldn't map by this object because + * it isn't available, but the error allows us to try + * byslot, then do so + */ + ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT); + rc = orte_rmaps_rr_assign_root_level(jdata); + } + } else if (ORTE_MAPPING_BYL3CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + rc = orte_rmaps_rr_assign_byobj(jdata, HWLOC_OBJ_CACHE, 3); + if (ORTE_ERR_NOT_FOUND == rc) { + /* if the mapper couldn't map by this object because + * it isn't available, but the error allows us to try + * byslot, then do so + */ + ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT); + rc = orte_rmaps_rr_assign_root_level(jdata); + } + } else if (ORTE_MAPPING_BYSOCKET == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + rc = orte_rmaps_rr_assign_byobj(jdata, HWLOC_OBJ_SOCKET, 0); + if (ORTE_ERR_NOT_FOUND == rc) { + /* if the mapper couldn't map by this object because + * it isn't available, but the error allows us to try + * byslot, then do so + */ + ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT); + rc = orte_rmaps_rr_assign_root_level(jdata); + } + } else if (ORTE_MAPPING_BYNUMA == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) { + rc = orte_rmaps_rr_assign_byobj(jdata, HWLOC_OBJ_NODE, 0); + if (ORTE_ERR_NOT_FOUND == rc) { + /* if the mapper couldn't map by this object because + * it isn't available, but the error allows us to try + * byslot, then do so + */ + ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSLOT); + rc = orte_rmaps_rr_assign_root_level(jdata); + } + } else { + /* unrecognized mapping directive */ + orte_show_help("help-orte-rmaps-base.txt", "unrecognized-policy", + true, "mapping", + orte_rmaps_base_print_mapping(jdata->map->mapping)); + rc = ORTE_ERR_SILENT; + } + return rc; +} + orte_rmaps_base_module_t orte_rmaps_round_robin_module = { - orte_rmaps_rr_map + .map_job = orte_rmaps_rr_map, + .assign_locations = orte_rmaps_rr_assign_locations }; diff --git a/orte/mca/rmaps/round_robin/rmaps_rr.h b/orte/mca/rmaps/round_robin/rmaps_rr.h index 6591a3b6c20..4d998bbbba1 100644 --- a/orte/mca/rmaps/round_robin/rmaps_rr.h +++ b/orte/mca/rmaps/round_robin/rmaps_rr.h @@ -9,7 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2015 Intel, Inc. All rights reserved. + * Copyright (c) 2015-2017 Intel, Inc. All rights reserved. * Copyright (c) 2017 Cisco Systems, Inc. All rights reserved * $COPYRIGHT$ * @@ -54,6 +54,13 @@ ORTE_MODULE_DECLSPEC int orte_rmaps_rr_byobj(orte_job_t *jdata, orte_app_context orte_vpid_t num_procs, hwloc_obj_type_t target, unsigned cache_level); +ORTE_MODULE_DECLSPEC int orte_rmaps_rr_assign_root_level(orte_job_t *jdata); + +ORTE_MODULE_DECLSPEC int orte_rmaps_rr_assign_byobj(orte_job_t *jdata, + hwloc_obj_type_t target, + unsigned cache_level); + + END_C_DECLS #endif diff --git a/orte/mca/rmaps/round_robin/rmaps_rr_assign.c b/orte/mca/rmaps/round_robin/rmaps_rr_assign.c new file mode 100644 index 00000000000..81fa0b67b08 --- /dev/null +++ b/orte/mca/rmaps/round_robin/rmaps_rr_assign.c @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2011 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2009-2013 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. + * Copyright (c) 2015 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" +#include "orte/constants.h" + +#include + +#include "opal/util/output.h" +#include "opal/mca/hwloc/base/base.h" + +#include "orte/util/show_help.h" +#include "orte/util/name_fns.h" +#include "orte/runtime/orte_globals.h" +#include "orte/mca/errmgr/errmgr.h" + +#include "orte/mca/rmaps/base/rmaps_private.h" +#include "orte/mca/rmaps/base/base.h" +#include "rmaps_rr.h" + +int orte_rmaps_rr_assign_root_level(orte_job_t *jdata) +{ + int i, m; + orte_node_t *node; + orte_proc_t *proc; + hwloc_obj_t obj=NULL; + + opal_output_verbose(2, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rr: assigning procs to root level for job %s", + ORTE_JOBID_PRINT(jdata->jobid)); + + for (m=0; m < jdata->map->nodes->size; m++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, m))) { + continue; + } + opal_output_verbose(2, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rr:slot working node %s", + node->name); + /* get the root object as we are not assigning + * locale here except at the node level */ + if (NULL == node->topology || NULL == node->topology->topo) { + /* nothing we can do */ + continue; + } + obj = hwloc_get_root_obj(node->topology->topo); + for (i=0; i < node->procs->size; i++) { + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) { + continue; + } + /* ignore procs from other jobs */ + if (proc->name.jobid != jdata->jobid) { + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rr:assign skipping proc %s - from another job", + ORTE_NAME_PRINT(&proc->name)); + continue; + } + orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR); + } + } + return ORTE_SUCCESS; +} + +/* mapping by hwloc object looks a lot like mapping by node, + * but has the added complication of possibly having different + * numbers of objects on each node + */ +int orte_rmaps_rr_assign_byobj(orte_job_t *jdata, + hwloc_obj_type_t target, + unsigned cache_level) +{ + int start, j, m, n; + orte_app_context_t *app; + orte_node_t *node; + orte_proc_t *proc; + hwloc_obj_t obj=NULL; + unsigned int nobjs; + + opal_output_verbose(2, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rr: assigning locations by %s for job %s", + hwloc_obj_type_string(target), + ORTE_JOBID_PRINT(jdata->jobid)); + + + /* start mapping procs onto objects, filling each object as we go until + * all procs are mapped. If one pass doesn't catch all the required procs, + * then loop thru the list again to handle the oversubscription + */ + for (n=0; n < jdata->apps->size; n++) { + if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, n))) { + continue; + } + for (m=0; m < jdata->map->nodes->size; m++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, m))) { + continue; + } + if (NULL == node->topology || NULL == node->topology->topo) { + orte_show_help("help-orte-rmaps-ppr.txt", "ppr-topo-missing", + true, node->name); + return ORTE_ERR_SILENT; + } + /* get the number of objects of this type on this node */ + nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, target, cache_level, OPAL_HWLOC_AVAILABLE); + if (0 == nobjs) { + continue; + } + opal_output_verbose(2, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rr: found %u %s objects on node %s", + nobjs, hwloc_obj_type_string(target), node->name); + + /* if this is a comm_spawn situation, start with the object + * where the parent left off and increment */ + if (ORTE_JOBID_INVALID != jdata->originator.jobid) { + start = (jdata->bkmark_obj + 1) % nobjs; + } else { + start = 0; + } + /* loop over the procs on this node */ + for (j=0; j < node->procs->size; j++) { + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { + continue; + } + /* ignore procs from other jobs */ + if (proc->name.jobid != jdata->jobid) { + opal_output_verbose(5, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rr:assign skipping proc %s - from another job", + ORTE_NAME_PRINT(&proc->name)); + continue; + } + /* ignore procs from other apps */ + if (proc->app_idx != app->idx) { + continue; + } + opal_output_verbose(20, orte_rmaps_base_framework.framework_output, + "mca:rmaps:rr: assigning proc to object %d", (j + start) % nobjs); + /* get the hwloc object */ + if (NULL == (obj = opal_hwloc_base_get_obj_by_type(node->topology->topo, target, cache_level, (j + start) % nobjs, OPAL_HWLOC_AVAILABLE))) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return ORTE_ERR_NOT_FOUND; + } + if (orte_rmaps_base.cpus_per_rank > (int)opal_hwloc_base_get_npus(node->topology->topo, obj)) { + orte_show_help("help-orte-rmaps-base.txt", "mapping-too-low", true, + orte_rmaps_base.cpus_per_rank, opal_hwloc_base_get_npus(node->topology->topo, obj), + orte_rmaps_base_print_mapping(orte_rmaps_base.mapping)); + return ORTE_ERR_SILENT; + } + orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR); + } + } + } + + return ORTE_SUCCESS; +} diff --git a/orte/mca/rmaps/seq/rmaps_seq.c b/orte/mca/rmaps/seq/rmaps_seq.c index 623a2184f59..9bbe2253964 100644 --- a/orte/mca/rmaps/seq/rmaps_seq.c +++ b/orte/mca/rmaps/seq/rmaps_seq.c @@ -54,7 +54,7 @@ static int orte_rmaps_seq_map(orte_job_t *jdata); /* define the module */ orte_rmaps_base_module_t orte_rmaps_seq_module = { - orte_rmaps_seq_map + .map_job = orte_rmaps_seq_map }; /* local object for tracking rank locations */ @@ -517,6 +517,10 @@ static int orte_rmaps_seq_map(orte_job_t *jdata) } } + /* mark that this job is to be fully + * described in the launch msg */ + orte_set_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL); + return ORTE_SUCCESS; error: diff --git a/orte/mca/state/base/state_base_fns.c b/orte/mca/state/base/state_base_fns.c index 69cfa8945a8..38c27ba08a2 100644 --- a/orte/mca/state/base/state_base_fns.c +++ b/orte/mca/state/base/state_base_fns.c @@ -899,8 +899,6 @@ void orte_state_base_check_all_complete(int fd, short args, void *cbdata) opal_pointer_array_set_item(map->nodes, index, NULL); /* maintain accounting */ OBJ_RELEASE(node); - /* flag that the node is no longer in a map */ - ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED); } OBJ_RELEASE(map); jdata->map = NULL; diff --git a/orte/mca/state/dvm/state_dvm.c b/orte/mca/state/dvm/state_dvm.c index 6fcecd26bee..d095813594f 100644 --- a/orte/mca/state/dvm/state_dvm.c +++ b/orte/mca/state/dvm/state_dvm.c @@ -255,7 +255,7 @@ static void vm_ready(int fd, short args, void *cbdata) /* if we couldn't provide the allocation regex on the orted * cmd line, then we need to provide all the info here */ if (!orte_nidmap_communicated) { - if (ORTE_SUCCESS != (rc = orte_util_nidmap_create(&nidmap))) { + if (ORTE_SUCCESS != (rc = orte_util_nidmap_create(orte_node_pool, &nidmap))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(buf); return; diff --git a/orte/mca/state/hnp/state_hnp.c b/orte/mca/state/hnp/state_hnp.c index c18c4a0e01a..cfde6135390 100644 --- a/orte/mca/state/hnp/state_hnp.c +++ b/orte/mca/state/hnp/state_hnp.c @@ -73,6 +73,8 @@ static orte_job_state_t launch_states[] = { ORTE_JOB_STATE_DAEMONS_LAUNCHED, ORTE_JOB_STATE_DAEMONS_REPORTED, ORTE_JOB_STATE_VM_READY, + ORTE_JOB_STATE_MAP, + ORTE_JOB_STATE_MAP_COMPLETE, ORTE_JOB_STATE_SYSTEM_PREP, ORTE_JOB_STATE_LAUNCH_APPS, ORTE_JOB_STATE_LOCAL_LAUNCH_COMPLETE, @@ -91,6 +93,8 @@ static orte_state_cbfunc_t launch_callbacks[] = { orte_plm_base_daemons_launched, orte_plm_base_daemons_reported, orte_plm_base_vm_ready, + orte_rmaps_base_map_job, + orte_plm_base_mapping_complete, orte_plm_base_complete_setup, orte_plm_base_launch_apps, orte_state_base_local_launch_complete, diff --git a/orte/mca/state/novm/state_novm.c b/orte/mca/state/novm/state_novm.c index 512f6cc43dd..72d7c0bd397 100644 --- a/orte/mca/state/novm/state_novm.c +++ b/orte/mca/state/novm/state_novm.c @@ -61,6 +61,7 @@ orte_state_base_module_t orte_state_novm_module = { }; static void allocation_complete(int fd, short args, void *cbdata); +static void map_complete(int fd, short args, void *cbdata); static void vm_ready(int fd, short args, void *cbdata); /* defined state machine sequence for no VM - individual @@ -74,6 +75,8 @@ static orte_job_state_t launch_states[] = { ORTE_JOB_STATE_DAEMONS_LAUNCHED, ORTE_JOB_STATE_DAEMONS_REPORTED, ORTE_JOB_STATE_VM_READY, + ORTE_JOB_STATE_MAP, + ORTE_JOB_STATE_MAP_COMPLETE, ORTE_JOB_STATE_SYSTEM_PREP, ORTE_JOB_STATE_LAUNCH_APPS, ORTE_JOB_STATE_LOCAL_LAUNCH_COMPLETE, @@ -93,6 +96,8 @@ static orte_state_cbfunc_t launch_callbacks[] = { orte_plm_base_daemons_launched, orte_plm_base_daemons_reported, vm_ready, + orte_rmaps_base_map_job, + map_complete, orte_plm_base_complete_setup, orte_plm_base_launch_apps, orte_state_base_local_launch_complete, @@ -195,7 +200,7 @@ static void allocation_complete(int fd, short args, void *cbdata) orte_job_t *daemons; orte_topology_t *t; orte_node_t *node; - int i, rc; + int i; jdata->state = ORTE_JOB_STATE_ALLOCATION_COMPLETE; @@ -235,21 +240,27 @@ static void allocation_complete(int fd, short args, void *cbdata) } } - /* perform the map */ - if (ORTE_SUCCESS != (rc = orte_rmaps_base_map_job(jdata))) { - ORTE_ERROR_LOG(rc); - ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); - goto done; - } - - /* after we map, we are ready to launch the daemons */ - ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_LAUNCH_DAEMONS); + /* move to the map stage */ + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP); done: /* cleanup */ OBJ_RELEASE(state); } +/* after we map, we are ready to launch the daemons */ +static void map_complete(int fd, short args, void *cbdata) +{ + orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata; + orte_job_t *jdata = state->jdata; + + jdata->state = ORTE_JOB_STATE_MAP_COMPLETE; + /* move to the map stage */ + ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_LAUNCH_DAEMONS); + + /* cleanup */ + OBJ_RELEASE(state); +} static void vm_ready(int fd, short args, void *cbdata) { diff --git a/orte/orted/pmix/pmix_server_pub.c b/orte/orted/pmix/pmix_server_pub.c index 4dcb9cfb755..6b3e5bde785 100644 --- a/orte/orted/pmix/pmix_server_pub.c +++ b/orte/orted/pmix/pmix_server_pub.c @@ -99,6 +99,10 @@ int pmix_server_publish_fn(opal_process_name_t *proc, opal_pmix_persistence_t persist = OPAL_PMIX_PERSIST_APP; bool rset, pset; + opal_output_verbose(1, orte_pmix_server_globals.output, + "%s orted:pmix:server PUBLISH", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); + /* create the caddy */ req = OBJ_NEW(pmix_server_req_t); (void)asprintf(&req->operation, "PUBLISH: %s:%d", __FILE__, __LINE__); @@ -259,6 +263,10 @@ int pmix_server_lookup_fn(opal_process_name_t *proc, char **keys, /* pack the keys too */ for (i=0; i < nkeys; i++) { + opal_output_verbose(5, orte_pmix_server_globals.output, + "%s lookup data %s for proc %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), keys[i], + ORTE_NAME_PRINT(proc)); if (OPAL_SUCCESS != (rc = opal_dss.pack(&req->msg, &keys[i], 1, OPAL_STRING))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(req); diff --git a/orte/runtime/data_type_support/orte_dt_packing_fns.c b/orte/runtime/data_type_support/orte_dt_packing_fns.c index c5914169198..04e434645f6 100644 --- a/orte/runtime/data_type_support/orte_dt_packing_fns.c +++ b/orte/runtime/data_type_support/orte_dt_packing_fns.c @@ -89,6 +89,53 @@ int orte_dt_pack_job(opal_buffer_t *buffer, const void *src, return rc; } + /* pack the attributes that need to be sent */ + count = 0; + OPAL_LIST_FOREACH(kv, &jobs[i]->attributes, orte_attribute_t) { + if (ORTE_ATTR_GLOBAL == kv->local) { + ++count; + } + } + if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, (void*)(&count), 1, ORTE_STD_CNTR))) { + ORTE_ERROR_LOG(rc); + return rc; + } + OPAL_LIST_FOREACH(kv, &jobs[i]->attributes, orte_attribute_t) { + if (ORTE_ATTR_GLOBAL == kv->local) { + if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, (void*)&kv, 1, ORTE_ATTRIBUTE))) { + ORTE_ERROR_LOG(rc); + return rc; + } + } + } + /* check for job info attribute */ + cache = NULL; + if (orte_get_attribute(&jobs[i]->attributes, ORTE_JOB_INFO_CACHE, (void**)&cache, OPAL_PTR) && + NULL != cache) { + /* we need to pack these as well, but they are composed + * of opal_value_t's on a list. So first pack the number + * of list elements */ + count = opal_list_get_size(cache); + if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, (void*)(&count), 1, ORTE_STD_CNTR))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /* now pack each element on the list */ + OPAL_LIST_FOREACH(val, cache, opal_value_t) { + if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, (void*)&val, 1, OPAL_VALUE))) { + ORTE_ERROR_LOG(rc); + return rc; + } + } + } else { + /* pack a zero to indicate no job info is being passed */ + count = 0; + if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, (void*)(&count), 1, ORTE_STD_CNTR))) { + ORTE_ERROR_LOG(rc); + return rc; + } + } + /* pack the personality */ count = opal_argv_count(jobs[i]->personality); if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &count, 1, OPAL_INT32))) { @@ -134,14 +181,18 @@ int orte_dt_pack_job(opal_buffer_t *buffer, const void *src, return rc; } - if (orte_no_vm && 0 < jobs[i]->num_procs) { - for (j=0; j < jobs[i]->procs->size; j++) { - if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jobs[i]->procs, j))) { - continue; - } - if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, (void*)&proc, 1, ORTE_PROC))) { - ORTE_ERROR_LOG(rc); - return rc; + if (0 < jobs[i]->num_procs) { + /* check attributes to see if this job is to be fully + * described in the launch msg */ + if (orte_get_attribute(&jobs[i]->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) { + for (j=0; j < jobs[i]->procs->size; j++) { + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jobs[i]->procs, j))) { + continue; + } + if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, (void*)&proc, 1, ORTE_PROC))) { + ORTE_ERROR_LOG(rc); + return rc; + } } } } @@ -198,53 +249,6 @@ int orte_dt_pack_job(opal_buffer_t *buffer, const void *src, ORTE_ERROR_LOG(rc); return rc; } - - /* pack the attributes that need to be sent */ - count = 0; - OPAL_LIST_FOREACH(kv, &jobs[i]->attributes, orte_attribute_t) { - if (ORTE_ATTR_GLOBAL == kv->local) { - ++count; - } - } - if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, (void*)(&count), 1, ORTE_STD_CNTR))) { - ORTE_ERROR_LOG(rc); - return rc; - } - OPAL_LIST_FOREACH(kv, &jobs[i]->attributes, orte_attribute_t) { - if (ORTE_ATTR_GLOBAL == kv->local) { - if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, (void*)&kv, 1, ORTE_ATTRIBUTE))) { - ORTE_ERROR_LOG(rc); - return rc; - } - } - } - /* check for job info attribute */ - cache = NULL; - if (orte_get_attribute(&jobs[i]->attributes, ORTE_JOB_INFO_CACHE, (void**)&cache, OPAL_PTR) && - NULL != cache) { - /* we need to pack these as well, but they are composed - * of opal_value_t's on a list. So first pack the number - * of list elements */ - count = opal_list_get_size(cache); - if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, (void*)(&count), 1, ORTE_STD_CNTR))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* now pack each element on the list */ - OPAL_LIST_FOREACH(val, cache, opal_value_t) { - if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, (void*)&val, 1, OPAL_VALUE))) { - ORTE_ERROR_LOG(rc); - return rc; - } - } - } else { - /* pack a zero to indicate no job info is being passed */ - count = 0; - if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, (void*)(&count), 1, ORTE_STD_CNTR))) { - ORTE_ERROR_LOG(rc); - return rc; - } - } } return ORTE_SUCCESS; } @@ -594,7 +598,11 @@ int orte_dt_pack_map(opal_buffer_t *buffer, const void *src, ORTE_ERROR_LOG(rc); return rc; } - + /* pack the last mapper */ + if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &(maps[i]->last_mapper), 1, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + return rc; + } /* pack the policies */ if (ORTE_SUCCESS != (rc = opal_dss_pack_buffer(buffer, &(maps[i]->mapping), 1, ORTE_MAPPING_POLICY))) { ORTE_ERROR_LOG(rc); diff --git a/orte/runtime/data_type_support/orte_dt_unpacking_fns.c b/orte/runtime/data_type_support/orte_dt_unpacking_fns.c index 93df939c8fb..6e49c160520 100644 --- a/orte/runtime/data_type_support/orte_dt_unpacking_fns.c +++ b/orte/runtime/data_type_support/orte_dt_unpacking_fns.c @@ -95,6 +95,44 @@ int orte_dt_unpack_job(opal_buffer_t *buffer, void *dest, return rc; } + /* unpack the attributes */ + n=1; + if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &count, + &n, ORTE_STD_CNTR))) { + ORTE_ERROR_LOG(rc); + return rc; + } + for (k=0; k < count; k++) { + n=1; + if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &kv, + &n, ORTE_ATTRIBUTE))) { + ORTE_ERROR_LOG(rc); + return rc; + } + kv->local = ORTE_ATTR_GLOBAL; // obviously not a local value + opal_list_append(&jobs[i]->attributes, &kv->super); + } + /* unpack any job info */ + n=1; + if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &count, + &n, ORTE_STD_CNTR))) { + ORTE_ERROR_LOG(rc); + return rc; + } + if (0 < count){ + cache = OBJ_NEW(opal_list_t); + orte_set_attribute(&jobs[i]->attributes, ORTE_JOB_INFO_CACHE, ORTE_ATTR_LOCAL, (void*)cache, OPAL_PTR); + for (k=0; k < count; k++) { + n=1; + if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &val, + &n, OPAL_VALUE))) { + ORTE_ERROR_LOG(rc); + return rc; + } + opal_list_append(cache, &val->super); + } + } + /* unpack the personality */ n=1; if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &count, &n, OPAL_INT32))) { @@ -147,16 +185,20 @@ int orte_dt_unpack_job(opal_buffer_t *buffer, void *dest, return rc; } - if (orte_no_vm && 0 < jobs[i]->num_procs) { - orte_proc_t *proc; - for (j=0; j < jobs[i]->num_procs; j++) { - n = 1; - if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, - &proc, &n, ORTE_PROC))) { - ORTE_ERROR_LOG(rc); - return rc; + if (0 < jobs[i]->num_procs) { + /* check attributes to see if this job was fully + * described in the launch msg */ + if (orte_get_attribute(&jobs[i]->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) { + orte_proc_t *proc; + for (j=0; j < jobs[i]->num_procs; j++) { + n = 1; + if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, + &proc, &n, ORTE_PROC))) { + ORTE_ERROR_LOG(rc); + return rc; + } + opal_pointer_array_add(jobs[i]->procs, proc); } - opal_pointer_array_add(jobs[i]->procs, proc); } } @@ -204,44 +246,6 @@ int orte_dt_unpack_job(opal_buffer_t *buffer, void *dest, ORTE_ERROR_LOG(rc); return rc; } - - /* unpack the attributes */ - n=1; - if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &count, - &n, ORTE_STD_CNTR))) { - ORTE_ERROR_LOG(rc); - return rc; - } - for (k=0; k < count; k++) { - n=1; - if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &kv, - &n, ORTE_ATTRIBUTE))) { - ORTE_ERROR_LOG(rc); - return rc; - } - kv->local = ORTE_ATTR_GLOBAL; // obviously not a local value - opal_list_append(&jobs[i]->attributes, &kv->super); - } - /* unpack any job info */ - n=1; - if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &count, - &n, ORTE_STD_CNTR))) { - ORTE_ERROR_LOG(rc); - return rc; - } - if (0 < count){ - cache = OBJ_NEW(opal_list_t); - orte_set_attribute(&jobs[i]->attributes, ORTE_JOB_INFO_CACHE, ORTE_ATTR_LOCAL, (void*)cache, OPAL_PTR); - for (k=0; k < count; k++) { - n=1; - if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &val, - &n, OPAL_VALUE))) { - ORTE_ERROR_LOG(rc); - return rc; - } - opal_list_append(cache, &val->super); - } - } } return ORTE_SUCCESS; @@ -655,6 +659,14 @@ int orte_dt_unpack_map(opal_buffer_t *buffer, void *dest, return rc; } + /* unpack the last mapper */ + n = 1; + if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, + &(maps[i]->last_mapper), &n, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /* unpack the policies */ n = 1; if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, diff --git a/orte/runtime/orte_data_server.c b/orte/runtime/orte_data_server.c index 807f13f5911..605b0acd077 100644 --- a/orte/runtime/orte_data_server.c +++ b/orte/runtime/orte_data_server.c @@ -12,7 +12,7 @@ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2012-2016 Los Alamos National Security, LLC. * All rights reserved - * Copyright (c) 2015-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2015-2017 Intel, Inc. All rights reserved. * Copyright (c) 2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. * $COPYRIGHT$ @@ -111,6 +111,8 @@ OBJ_CLASS_INSTANCE(orte_data_req_t, static opal_pointer_array_t orte_data_server_store; static opal_list_t pending; static bool initialized = false; +static int orte_data_server_output = -1; +static int orte_data_server_verbosity = -1; int orte_data_server_init(void) { @@ -121,6 +123,19 @@ int orte_data_server_init(void) } initialized = true; + /* register a verbosity */ + orte_data_server_verbosity = -1; + (void) mca_base_var_register ("orte", "orte", "data", "server_verbose", + "Debug verbosity for ORTE data server", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL, + &orte_data_server_verbosity); + if (0 <= orte_data_server_verbosity) { + orte_data_server_output = opal_output_open(NULL); + opal_output_set_verbosity(orte_data_server_output, + orte_data_server_verbosity); + } + OBJ_CONSTRUCT(&orte_data_server_store, opal_pointer_array_t); if (ORTE_SUCCESS != (rc = opal_pointer_array_init(&orte_data_server_store, 1, @@ -180,7 +195,7 @@ void orte_data_server(int status, orte_process_name_t* sender, orte_data_req_t *req, *rqnext; orte_jobid_t jobid = ORTE_JOBID_INVALID; - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + OPAL_OUTPUT_VERBOSE((1, orte_data_server_output, "%s data server got message from %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(sender))); @@ -218,7 +233,7 @@ void orte_data_server(int status, orte_process_name_t* sender, goto SEND_ERROR; } - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + OPAL_OUTPUT_VERBOSE((1, orte_data_server_output, "%s data server: publishing data from %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&data->owner))); @@ -245,7 +260,7 @@ void orte_data_server(int status, orte_process_name_t* sender, data->uid = iptr->data.uint32; OBJ_RELEASE(iptr); } else { - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + OPAL_OUTPUT_VERBOSE((10, orte_data_server_output, "%s data server: adding %s to data from %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), iptr->key, ORTE_NAME_PRINT(&data->owner))); @@ -255,7 +270,7 @@ void orte_data_server(int status, orte_process_name_t* sender, data->index = opal_pointer_array_add(&orte_data_server_store, data); - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + OPAL_OUTPUT_VERBOSE((1, orte_data_server_output, "%s data server: checking for pending requests", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); @@ -276,7 +291,14 @@ void orte_data_server(int status, orte_process_name_t* sender, for (i=0; NULL != req->keys[i]; i++) { /* cycle thru the data keys for matches */ OPAL_LIST_FOREACH(iptr, &data->values, opal_value_t) { + OPAL_OUTPUT_VERBOSE((10, orte_data_server_output, + "%s\tCHECKING %s TO %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + iptr->key, req->keys[i])); if (0 == strcmp(iptr->key, req->keys[i])) { + OPAL_OUTPUT_VERBOSE((10, orte_data_server_output, + "%s data server: packaging return", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* found it - package it for return */ if (NULL == reply) { reply = OBJ_NEW(opal_buffer_t); @@ -296,7 +318,7 @@ void orte_data_server(int status, orte_process_name_t* sender, ORTE_ERROR_LOG(rc); break; } - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + OPAL_OUTPUT_VERBOSE((10, orte_data_server_output, "%s data server: adding %s data from %s to response", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), iptr->key, ORTE_NAME_PRINT(&data->owner))); @@ -309,7 +331,7 @@ void orte_data_server(int status, orte_process_name_t* sender, } if (NULL != reply) { /* send it back to the requestor */ - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + OPAL_OUTPUT_VERBOSE((1, orte_data_server_output, "%s data server: returning data to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&req->requestor))); @@ -326,11 +348,11 @@ void orte_data_server(int status, orte_process_name_t* sender, reply = NULL; /* if the persistence is "first_read", then delete this data */ if (OPAL_PMIX_PERSIST_FIRST_READ == data->persistence) { - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + OPAL_OUTPUT_VERBOSE((1, orte_data_server_output, "%s NOT STORING DATA FROM %s AT INDEX %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&data->owner), data->index)); - opal_pointer_array_set_item(&orte_data_server_store, data->index, NULL); + ORTE_NAME_PRINT(&data->owner), data->index); + opal_pointer_array_set_item(&orte_data_server_store, data->index, NULL)); OBJ_RELEASE(data); goto release; } @@ -349,7 +371,7 @@ void orte_data_server(int status, orte_process_name_t* sender, break; case ORTE_PMIX_LOOKUP_CMD: - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + OPAL_OUTPUT_VERBOSE((1, orte_data_server_output, "%s data server: lookup data from %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(sender))); @@ -416,7 +438,7 @@ void orte_data_server(int status, orte_process_name_t* sender, /* cycle across the provided keys */ ret_packed = false; for (i=0; NULL != keys[i]; i++) { - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + OPAL_OUTPUT_VERBOSE((10, orte_data_server_output, "%s data server: looking for %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), keys[i])); /* cycle across the stored data, looking for a match */ @@ -428,6 +450,10 @@ void orte_data_server(int status, orte_process_name_t* sender, } /* for security reasons, can only access data posted by the same user id */ if (uid != data->uid) { + OPAL_OUTPUT_VERBOSE((10, orte_data_server_output, + "%s\tMISMATCH UID %u %u", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (unsigned)uid, (unsigned)data->uid)); continue; } /* if the published range is constrained to namespace, then only @@ -435,12 +461,17 @@ void orte_data_server(int status, orte_process_name_t* sender, * in the same namespace as the requestor */ if (OPAL_PMIX_RANGE_NAMESPACE == data->range) { if (jobid != data->owner.jobid) { + OPAL_OUTPUT_VERBOSE((10, orte_data_server_output, + "%s\tMISMATCH JOBID %s %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(jobid), + ORTE_JOBID_PRINT(data->owner.jobid))); continue; } } /* see if we have this key */ OPAL_LIST_FOREACH(iptr, &data->values, opal_value_t) { - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + OPAL_OUTPUT_VERBOSE((10, orte_data_server_output, "%s COMPARING %s %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), keys[i], iptr->key)); @@ -461,7 +492,7 @@ void orte_data_server(int status, orte_process_name_t* sender, opal_argv_free(keys); goto SEND_ERROR; } - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + OPAL_OUTPUT_VERBOSE((1, orte_data_server_output, "%s data server: adding %s to data from %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), iptr->key, ORTE_NAME_PRINT(&data->owner))); @@ -473,7 +504,7 @@ void orte_data_server(int status, orte_process_name_t* sender, } } if (data_added && OPAL_PMIX_PERSIST_FIRST_READ == data->persistence) { - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + OPAL_OUTPUT_VERBOSE((1, orte_data_server_output, "%s REMOVING DATA FROM %s AT INDEX %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&data->owner), data->index)); @@ -483,14 +514,14 @@ void orte_data_server(int status, orte_process_name_t* sender, } } if (!ret_packed) { - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + OPAL_OUTPUT_VERBOSE((1, orte_data_server_output, "%s data server:lookup: data not found", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* if we were told to wait for the data, then queue this up * for later processing */ if (wait) { - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + OPAL_OUTPUT_VERBOSE((1, orte_data_server_output, "%s data server:lookup: pushing request to wait", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); OBJ_RELEASE(answer); @@ -510,7 +541,7 @@ void orte_data_server(int status, orte_process_name_t* sender, } opal_argv_free(keys); - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + OPAL_OUTPUT_VERBOSE((1, orte_data_server_output, "%s data server:lookup: data found", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); goto SEND_ANSWER; @@ -524,7 +555,7 @@ void orte_data_server(int status, orte_process_name_t* sender, goto SEND_ERROR; } - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + OPAL_OUTPUT_VERBOSE((1, orte_data_server_output, "%s data server: unpublish data from %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&requestor))); @@ -629,7 +660,7 @@ void orte_data_server(int status, orte_process_name_t* sender, } SEND_ERROR: - OPAL_OUTPUT_VERBOSE((1, orte_debug_output, + OPAL_OUTPUT_VERBOSE((1, orte_data_server_output, "%s data server: sending error %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_ERROR_NAME(rc))); @@ -646,5 +677,3 @@ void orte_data_server(int status, orte_process_name_t* sender, OBJ_RELEASE(answer); } } - - diff --git a/orte/test/mpi/Makefile b/orte/test/mpi/Makefile index 3bf63b8b0b3..47f183a6e57 100644 --- a/orte/test/mpi/Makefile +++ b/orte/test/mpi/Makefile @@ -1,4 +1,11 @@ -PROGS = mpi_no_op mpi_barrier hello hello_nodename abort multi_abort simple_spawn concurrent_spawn spawn_multiple mpi_spin delayed_abort loop_spawn loop_child bad_exit pubsub hello_barrier segv accept connect hello_output hello_show_help crisscross read_write ziatest slave reduce-hang ziaprobe ziatest bcast_loop parallel_w8 parallel_w64 parallel_r8 parallel_r64 sio sendrecv_blaster early_abort debugger singleton_client_server intercomm_create spawn_tree init-exit77 mpi_info info_spawn server client paccept pconnect ring hello.sapp binding badcoll attach xlib +PROGS = mpi_no_op mpi_barrier hello hello_nodename abort multi_abort simple_spawn \ + concurrent_spawn spawn_multiple mpi_spin delayed_abort loop_spawn loop_child \ + bad_exit pubsub hello_barrier segv accept connect hello_output hello_show_help \ + crisscross read_write ziatest slave reduce-hang ziaprobe ziatest bcast_loop \ + parallel_w8 parallel_w64 parallel_r8 parallel_r64 sio sendrecv_blaster early_abort \ + debugger singleton_client_server intercomm_create spawn_tree init-exit77 mpi_info \ + info_spawn server client paccept pconnect ring hello.sapp binding badcoll attach xlib \ + no-disconnect all: $(PROGS) diff --git a/orte/test/mpi/no-disconnect.c b/orte/test/mpi/no-disconnect.c new file mode 100644 index 00000000000..9403b3ff345 --- /dev/null +++ b/orte/test/mpi/no-disconnect.c @@ -0,0 +1,210 @@ +/* Contributed by Marcia Cristina Cera + , + http://www.open-mpi.org/community/lists/users/2009/12/11540.php */ + +/* It was decided that the issue highlighted by this test will NOT be + fixed in the 1.3/1.4 series. It is already fixed in the 1.5 + series. Hence, if we detect Open MPI < v1.5, return 77/skip. */ +/* Turns out the hnp cannot handle concurrent MPI_Comm_spawns + as of Open MPI 1.7. However, we hope this feature will + work in 2.0. with the new state machine based orte. */ + +#include +#include +#include +#include +#include +#include + +#include + +#define NCHARS 30 +const int max_depth = 4; + +/* + * Here are some replacements for standard, blocking MPI + * functions. These replacements are "nice" and yield the + * CPU instead of spinning hard. The interfaces are the same. + * Just replace: + * MPI_Recv with nice_recv + * MPI_Send with nice_send + * MPI_Barrier with nice_barrier + */ + + +static int nice_send(void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm) { + /* Assume a standard (presumably short/eager) send suffices. */ + return MPI_Send(buf, count, datatype, dest, tag, comm); +} + + +static int nice_recv(void *buf, int count, MPI_Datatype datatype, int source, int tag, MPI_Comm comm, MPI_Status *status) { + MPI_Request req; + int flag; + struct timespec dt; + + /* + * We're only interested in modest levels of oversubscription + * -- e.g., 2-4x more processes than physical processors. + * So, the sleep time only needs to be about 2-4x longer than + * a futile MPI_Test call. For a wide range of processors, + * something less than a millisecond should be sufficient. + * Excessive sleep times (e.g., 1 second) would degrade performance. + */ + dt.tv_sec = 0; + dt.tv_nsec = 100000; + + MPI_Irecv(buf, count, datatype, source, tag, comm, &req); + + MPI_Test(&req, &flag, status); + while ( ! flag ) { + nanosleep(&dt, NULL); + MPI_Test(&req, &flag, status); + } + return MPI_SUCCESS; +} + + +static void nice_barrier(MPI_Comm comm) { + int me, np, jump, buf = -1; + + MPI_Comm_rank(comm,&me); + MPI_Comm_size(comm,&np); + + /* fan in */ + for ( jump = 1; jump < np; jump <<= 1 ) { + if ( ( me & jump ) != 0 ) { + nice_send(&buf, 1, MPI_INT, me - jump, 343, comm); + break; + } else if ( me + jump < np ) { + nice_recv(&buf, 1, MPI_INT, me + jump, 343, comm, MPI_STATUS_IGNORE); + } + } + + /* fan out */ + if ( 0 != me ) { + nice_recv(&buf, 1, MPI_INT, me - jump, 344, comm, MPI_STATUS_IGNORE); + } + jump >>= 1; + for ( ; jump > 0; jump >>= 1 ) { + if ( me + jump < np ) { + nice_send(&buf, 1, MPI_INT, me + jump, 344, comm); + } + } +} + + +int main (int argc, char **argv) +{ + char bufs [NCHARS]; /* send buffer */ + char bufr[2][NCHARS]; /* recv buffers */ + MPI_Comm parent; + int level = 0, participate = 1; + struct utsname buf; + + /* If this is prior to OMPI v2.0, return 77/skip */ +#if defined(OPEN_MPI) + if (OMPI_MAJOR_VERSION < 2) { + printf("Skipping, because the orte cannot handle concurrent MPI_Comm_spawns\n"); + return 77; + } else { + printf("Verify that this test is truly working because conncurrent MPI_Comm_spawns" + " has not worked before.\n"); + } +#endif + + uname(&buf); + printf("I AM pid %d with level %d on %s\n", getpid(), (argc < 2)?0:atoi(argv[1]), buf.nodename); + + MPI_Init(&argc, &argv); + MPI_Comm_get_parent(&parent); + + if (MPI_COMM_NULL != parent) { + /* spawned processes get stuff from parent */ + level = atoi(argv[1]); + MPI_Recv(&bufr[0], sizeof(char)*NCHARS, MPI_CHAR, MPI_ANY_SOURCE, + MPI_ANY_TAG, parent, MPI_STATUS_IGNORE); + printf("Parent sent: %s\n", bufr[0]); + } else { + + /* original processes have to decide whether to participate */ + + /* In this test, each process launched by "mpirun -n " spawns a + * binary tree of processes. You end up with * ( 1 << max_depth ) + * processes altogether. For max_depth=4, this means 16*. There + * is potential here for heavy oversubscription, especially if in + * testing we launch tests with set to the number of available + * processors. This test tolerates oversubscription somewhat since + * it entails little inter-process synchronization. Nevertheless, + * we try to idle all but /4 of the original processes, using a + * minimum of at least two processes + */ + + int me, np; + + MPI_Comm_size(MPI_COMM_WORLD,&np); + MPI_Comm_rank(MPI_COMM_WORLD,&me); + + if ( np > 4 ) { + /* turn off all but every 4th process */ + if ( ( me & 3 ) != 0 ) participate = 0; + } else + if ( np > 2 ) { + /* turn off all but every 2nd process */ + if ( ( me & 1 ) != 0 ) participate = 0; + } + } + + /* all spawned processes and selected "root" processes participate */ + if ( participate ) { + printf("level = %d\n", level); + + /* prepare send buffer */ + sprintf(bufs,"level %d (pid:%d)", level, getpid()); + + /* spawn */ + if (level < max_depth) { + int i, nspawn = 2, errcodes[1]; + MPI_Request req[2]; + MPI_Comm comm[2]; + char argv1[NCHARS]; + char *args[2]; + + /* level 0 spawns only one process to mimic the original test */ + if ( level == 0 ) nspawn = 1; + + /* prepare command line arguments */ + snprintf(argv1, sizeof(argv1), "%d", level+1); + args[0] = argv1; + args[1] = NULL; + + /* spawn, with a message sent to and received from each child */ + for ( i = 0; i < nspawn; i++ ) { + MPI_Comm_spawn(argv[0], args, 1, MPI_INFO_NULL, 0, MPI_COMM_SELF, + &comm[i], errcodes); + MPI_Send(&bufs, sizeof(char)*NCHARS, MPI_CHAR, 0, 100, comm[i]); + MPI_Irecv(&bufr[i], sizeof(char)*NCHARS, MPI_CHAR, MPI_ANY_SOURCE, + MPI_ANY_TAG, comm[i], &req[i]); + } + + /* wait for messages from children and print them */ + MPI_Waitall(nspawn, req, MPI_STATUSES_IGNORE); + for ( i = 0; i < nspawn; i++ ) + printf("Child %d sent: %s\n", i, bufr[i]); + } + + /* send message back to parent */ + if (MPI_COMM_NULL != parent) { + MPI_Send(&bufs, sizeof(char)*NCHARS, MPI_CHAR, 0, 100, parent); + } + } + + /* non-participating processes wait at this barrier for their peers */ + /* (This barrier won't cost that many CPU cycles.) */ + if (MPI_COMM_NULL == parent) { + nice_barrier(MPI_COMM_WORLD); + } + + MPI_Finalize(); + return 0; +} diff --git a/orte/util/attr.c b/orte/util/attr.c index 1f447f4a87c..a2d6ed48a7d 100644 --- a/orte/util/attr.c +++ b/orte/util/attr.c @@ -286,6 +286,8 @@ const char *orte_attr_key_to_str(orte_attribute_key_t key) return "ORTE_JOB_TRANSPORT_KEY"; case ORTE_JOB_INFO_CACHE: return "ORTE_JOB_INFO_CACHE"; + case ORTE_JOB_FULLY_DESCRIBED: + return "ORTE_JOB_FULLY_DESCRIBED"; case ORTE_PROC_NOBARRIER: return "PROC-NOBARRIER"; diff --git a/orte/util/attr.h b/orte/util/attr.h index 1b961030091..817581e38b6 100644 --- a/orte/util/attr.h +++ b/orte/util/attr.h @@ -143,6 +143,7 @@ typedef uint16_t orte_job_flags_t; #define ORTE_JOB_NOTIFY_COMPLETION (ORTE_JOB_START_KEY + 50) // bool - notify parent proc when spawned job terminates #define ORTE_JOB_TRANSPORT_KEY (ORTE_JOB_START_KEY + 51) // string - transport keys assigned to this job #define ORTE_JOB_INFO_CACHE (ORTE_JOB_START_KEY + 52) // opal_list_t - list of opal_value_t to be included in job_info +#define ORTE_JOB_FULLY_DESCRIBED (ORTE_JOB_START_KEY + 53) // bool - job is fully described in launch msg #define ORTE_JOB_MAX_KEY 300 diff --git a/orte/util/nidmap.c b/orte/util/nidmap.c index 3b2ec9bdfeb..ca4948fcbca 100644 --- a/orte/util/nidmap.c +++ b/orte/util/nidmap.c @@ -198,7 +198,7 @@ int orte_util_build_daemon_nidmap(void) return rc; } -int orte_util_nidmap_create(char **regex) +int orte_util_nidmap_create(opal_pointer_array_t *pool, char **regex) { char *node; char prefix[ORTE_MAX_NODE_PREFIX]; @@ -217,8 +217,8 @@ int orte_util_nidmap_create(char **regex) OBJ_CONSTRUCT(&dvpids, opal_list_t); rng = NULL; - for (n=0; n < orte_node_pool->size; n++) { - if (NULL == (nptr = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, n))) { + for (n=0; n < pool->size; n++) { + if (NULL == (nptr = (orte_node_t*)opal_pointer_array_get_item(pool, n))) { continue; } /* if no daemon has been assigned, then this node is not being used */ @@ -1180,3 +1180,217 @@ int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer) OPAL_LIST_DESTRUCT(&flgs); return rc; } + +typedef struct { + opal_list_item_t super; + int ctx; + int nprocs; + int cnt; +} orte_nidmap_regex_t; +static void nrcon(orte_nidmap_regex_t *p) +{ + p->ctx = 0; + p->nprocs = -1; + p->cnt = 0; +} +static OBJ_CLASS_INSTANCE(orte_nidmap_regex_t, + opal_list_item_t, + nrcon, NULL); + +/* since not every node is involved in a job, we have to create a + * regex that indicates the ppn for every node, marking those that + * are not involved. Since each daemon knows the entire + * node pool, we simply provide a ppn for every daemon, with a -1 + * to indicate that the node is empty for that job */ +int orte_util_nidmap_generate_ppn(orte_job_t *jdata, char **ppn) +{ + orte_nidmap_regex_t *prng, **actives; + opal_list_t *prk; + orte_node_t *nptr; + orte_proc_t *proc; + size_t n; + int *cnt, i, k; + char *tmp2, *ptmp, **cache = NULL; + + /* create an array of lists to handle the number of app_contexts in this job */ + prk = (opal_list_t*)malloc(jdata->num_apps * sizeof(opal_list_t)); + cnt = (int*)malloc(jdata->num_apps * sizeof(int)); + actives = (orte_nidmap_regex_t**)malloc(jdata->num_apps * sizeof(orte_nidmap_regex_t*)); + for (n=0; n < jdata->num_apps; n++) { + OBJ_CONSTRUCT(&prk[n], opal_list_t); + actives[n] = NULL; + } + + /* we provide a complete map in the regex, with an entry for every + * node in the pool */ + for (i=0; i < orte_node_pool->size; i++) { + if (NULL == (nptr = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) { + continue; + } + /* if a daemon has been assigned, then count how many procs + * for each app_context from the specified job are assigned to this node */ + memset(cnt, 0, jdata->num_apps * sizeof(int)); + if (NULL != nptr->daemon) { + for (k=0; k < nptr->procs->size; k++) { + if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(nptr->procs, k))) { + if (proc->name.jobid == jdata->jobid) { + ++cnt[proc->app_idx]; + } + } + } + } + /* track the #procs on this node */ + for (n=0; n < jdata->num_apps; n++) { + if (NULL == actives[n]) { + /* just starting */ + actives[n] = OBJ_NEW(orte_nidmap_regex_t); + actives[n]->nprocs = cnt[n]; + actives[n]->cnt = 1; + opal_list_append(&prk[n], &actives[n]->super); + } else { + /* is this the next in line */ + if (cnt[n] == actives[n]->nprocs) { + actives[n]->cnt++; + } else { + /* need to start another range */ + actives[n] = OBJ_NEW(orte_nidmap_regex_t); + actives[n]->nprocs = cnt[n]; + actives[n]->cnt = 1; + opal_list_append(&prk[n], &actives[n]->super); + } + } + } + } + + /* construct the regex from the found ranges for each app_context */ + ptmp = NULL; + for (n=0; n < jdata->num_apps; n++) { + OPAL_LIST_FOREACH(prng, &prk[n], orte_nidmap_regex_t) { + if (1 < prng->cnt) { + if (NULL == ptmp) { + asprintf(&ptmp, "%u(%u)", prng->nprocs, prng->cnt); + } else { + asprintf(&tmp2, "%s,%u(%u)", ptmp, prng->nprocs, prng->cnt); + free(ptmp); + ptmp = tmp2; + } + } else { + if (NULL == ptmp) { + asprintf(&ptmp, "%u", prng->nprocs); + } else { + asprintf(&tmp2, "%s,%u", ptmp, prng->nprocs); + free(ptmp); + ptmp = tmp2; + } + } + } + OPAL_LIST_DESTRUCT(&prk[n]); // releases all the actives objects + opal_argv_append_nosize(&cache, ptmp); + free(ptmp); + ptmp = NULL; + } + free(prk); + free(cnt); + free(actives); + + *ppn = opal_argv_join(cache, '@'); + opal_argv_free(cache); + + return ORTE_SUCCESS; +} + +int orte_util_nidmap_parse_ppn(orte_job_t *jdata, char *regex) +{ + orte_node_t *node; + orte_proc_t *proc; + int n, k, m, cnt; + char **tmp, *ptr, **ppn; + orte_nidmap_regex_t *rng; + opal_list_t trk; + int rc = ORTE_SUCCESS; + + /* split the regex by app_context */ + tmp = opal_argv_split(regex, '@'); + + /* for each app_context, set the ppn */ + for (n=0; NULL != tmp[n]; n++) { + ppn = opal_argv_split(tmp[n], ','); + /* decompress the ppn */ + OBJ_CONSTRUCT(&trk, opal_list_t); + for (m=0; NULL != ppn[m]; m++) { + rng = OBJ_NEW(orte_nidmap_regex_t); + opal_list_append(&trk, &rng->super); + /* check for a count */ + if (NULL != (ptr = strchr(ppn[m], '('))) { + ppn[m][strlen(ppn[m])-1] = '\0'; // remove trailing paren + *ptr = '\0'; + ++ptr; + rng->cnt = strtoul(ptr, NULL, 10); + } else { + rng->cnt = 1; + } + /* convert the number */ + rng->nprocs = strtoul(ppn[m], NULL, 10); + } + opal_argv_free(ppn); + + /* cycle thru our node pool and add the indicated number of procs + * to each node */ + rng = (orte_nidmap_regex_t*)opal_list_get_first(&trk); + cnt = 0; + for (m=0; m < orte_node_pool->size; m++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, m))) { + continue; + } + /* see if it has any procs for this job and app_context */ + if (0 < rng->nprocs) { + /* add this node to the job map if it isn't already there */ + if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) { + OBJ_RETAIN(node); + ORTE_FLAG_SET(node, ORTE_NODE_FLAG_MAPPED); + opal_pointer_array_add(jdata->map->nodes, node); + } + /* create a proc object for each one */ + for (k=0; k < rng->nprocs; k++) { + proc = OBJ_NEW(orte_proc_t); + proc->name.jobid = jdata->jobid; + /* leave the vpid undefined as this will be determined + * later when we do the overall ranking */ + proc->app_idx = n; + proc->parent = node->daemon->name.vpid; + OBJ_RETAIN(node); + proc->node = node; + /* flag the proc as ready for launch */ + proc->state = ORTE_PROC_STATE_INIT; + opal_pointer_array_add(node->procs, proc); + /* we will add the proc to the jdata array when we + * compute its rank */ + } + node->num_procs += rng->nprocs; + } + ++cnt; + if (rng->cnt <= cnt) { + rng = (orte_nidmap_regex_t*)opal_list_get_next(&rng->super); + if (NULL == rng) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + opal_argv_free(tmp); + rc = ORTE_ERR_NOT_FOUND; + goto complete; + } + cnt = 0; + } + } + OPAL_LIST_DESTRUCT(&trk); + } + opal_argv_free(tmp); + + complete: + /* reset any node map flags we used so the next job will start clean */ + for (n=0; n < jdata->map->nodes->size; n++) { + if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, n))) { + ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED); + } + } + + return rc; +} diff --git a/orte/util/nidmap.h b/orte/util/nidmap.h index 3acc29b9277..e8c6f59bc21 100644 --- a/orte/util/nidmap.h +++ b/orte/util/nidmap.h @@ -46,7 +46,7 @@ BEGIN_C_DECLS ORTE_DECLSPEC void orte_util_nidmap_init(void); -ORTE_DECLSPEC int orte_util_nidmap_create(char **regex); +ORTE_DECLSPEC int orte_util_nidmap_create(opal_pointer_array_t *pool, char **regex); ORTE_DECLSPEC int orte_util_nidmap_parse(char *regex); /* create a regular expression describing the nodes in the @@ -59,6 +59,12 @@ ORTE_DECLSPEC int orte_util_decode_daemon_nodemap(opal_buffer_t *buffer); ORTE_DECLSPEC int orte_util_build_daemon_nidmap(void); +/* create a regular expression describing the ppn for a job */ +ORTE_DECLSPEC int orte_util_nidmap_generate_ppn(orte_job_t *jdata, char **ppn); + +/* decode the ppn */ +ORTE_DECLSPEC int orte_util_nidmap_parse_ppn(orte_job_t *jdata, char *ppn); + END_C_DECLS #endif