From 3a2d6a5ab6101c123517a691f0045fe458d813e7 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Wed, 28 Dec 2016 09:14:26 -0800 Subject: [PATCH] Begin to reduce reliance of application procs on the topology tree itself by having the daemon provide more detailed info. In this case, provide the topology description string so that procs can readily determine the number of each type of object on the node, and a "locality" string that describes which objects this process is executing upon. The latter allows a process to compute the objects of overlap between itself and another proc without consulting the topology tree. Signed-off-by: Ralph Castain --- opal/mca/btl/sm/btl_sm.c | 110 ++++++--- opal/mca/hwloc/base/base.h | 12 +- opal/mca/hwloc/base/hwloc_base_util.c | 254 ++++++++++++++++++++- opal/mca/pmix/pmix_types.h | 2 + orte/mca/ess/pmi/ess_pmi_module.c | 61 +++-- orte/mca/rmaps/base/rmaps_base_map_job.c | 0 orte/orted/pmix/pmix_server_register_fns.c | 120 +++------- 7 files changed, 401 insertions(+), 158 deletions(-) mode change 100755 => 100644 orte/mca/rmaps/base/rmaps_base_map_job.c diff --git a/opal/mca/btl/sm/btl_sm.c b/opal/mca/btl/sm/btl_sm.c index d5a8d31e0ae..4bc07d9a2d8 100644 --- a/opal/mca/btl/sm/btl_sm.c +++ b/opal/mca/btl/sm/btl_sm.c @@ -16,7 +16,7 @@ * All rights reserved. * Copyright (c) 2010-2012 IBM Corporation. All rights reserved. * Copyright (c) 2012 Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013-2015 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2016 Intel, Inc. All rights reserved. * Copyright (c) 2014-2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2016 ARM, Inc. All rights reserved.
@@ -52,6 +52,7 @@ #include "opal/util/show_help.h" #include "opal/util/printf.h" #include "opal/mca/hwloc/base/base.h" +#include "opal/mca/pmix/pmix.h" #include "opal/mca/shmem/base/base.h" #include "opal/mca/shmem/shmem.h" @@ -223,23 +224,28 @@ sm_btl_first_time_init(mca_btl_sm_t *sm_btl, int my_mem_node, num_mem_nodes, i, rc; mca_common_sm_mpool_resources_t *res = NULL; mca_btl_sm_component_t* m = &mca_btl_sm_component; + char *loc, *mynuma; + opal_process_name_t wildcard_rank; /* Assume we don't have hwloc support and fill in dummy info */ mca_btl_sm_component.mem_node = my_mem_node = 0; mca_btl_sm_component.num_mem_nodes = num_mem_nodes = 1; - /* If we have hwloc support, then get accurate information */ - if (NULL != opal_hwloc_topology) { - i = opal_hwloc_base_get_nbobjs_by_type(opal_hwloc_topology, - HWLOC_OBJ_NODE, 0, - OPAL_HWLOC_AVAILABLE); - - /* If we find >0 NUMA nodes, then investigate further */ - if (i > 0) { - int numa=0, w; - unsigned n_bound=0; - hwloc_cpuset_t avail; - hwloc_obj_t obj; + /* see if we were given a topology signature */ + wildcard_rank.jobid = OPAL_PROC_MY_NAME.jobid; + wildcard_rank.vpid = OPAL_VPID_WILDCARD; + OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, OPAL_PMIX_TOPOLOGY_SIGNATURE, + &wildcard_rank, &loc, OPAL_STRING); + if (OPAL_SUCCESS == rc) { + /* the number of NUMA nodes is right at the front */ + mca_btl_sm_component.num_mem_nodes = num_mem_nodes = strtoul(loc, NULL, 10); + free(loc); + } else { + /* If we have hwloc support, then get accurate information */ + if (NULL != opal_hwloc_topology) { + i = opal_hwloc_base_get_nbobjs_by_type(opal_hwloc_topology, + HWLOC_OBJ_NODE, 0, + OPAL_HWLOC_AVAILABLE); /* JMS This tells me how many numa nodes are *available*, but it's not how many are being used *by this job*. @@ -248,33 +254,65 @@ sm_btl_first_time_init(mca_btl_sm_t *sm_btl, should be improved to be how many NUMA nodes are being used *in this job*. 
*/ mca_btl_sm_component.num_mem_nodes = num_mem_nodes = i; + } + } + /* see if we were given our location */ + OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, OPAL_PMIX_LOCALITY_STRING, + &OPAL_PROC_MY_NAME, &loc, OPAL_STRING); + if (OPAL_SUCCESS == rc) { + if (NULL == loc) { + mca_btl_sm_component.mem_node = my_mem_node = -1; + } else { + /* get our NUMA location */ + mynuma = opal_hwloc_base_get_location(loc, HWLOC_OBJ_NODE, 0); + if (NULL == mynuma || + NULL != strchr(mynuma, ',') || + NULL != strchr(mynuma, '-')) { + /* we either have no idea what NUMA we are on, or we + * are on multiple NUMA nodes */ + mca_btl_sm_component.mem_node = my_mem_node = -1; + } else { + /* we are bound to a single NUMA node */ + my_mem_node = strtoul(mynuma, NULL, 10); + mca_btl_sm_component.mem_node = my_mem_node; + } + if (NULL != mynuma) { + free(mynuma); + } + free(loc); + } + } else { + /* If we have hwloc support, then get accurate information */ + if (NULL != opal_hwloc_topology && num_mem_nodes > 0 && + NULL != opal_process_info.cpuset) { + int numa=0, w; + unsigned n_bound=0; + hwloc_cpuset_t avail; + hwloc_obj_t obj; - /* if we are not bound, then there is nothing further to do */ - if (NULL != opal_process_info.cpuset) { - /* count the number of NUMA nodes to which we are bound */ - for (w=0; w < i; w++) { - if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology, - HWLOC_OBJ_NODE, 0, w, - OPAL_HWLOC_AVAILABLE))) { - continue; - } - /* get that NUMA node's available cpus */ - avail = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj); - /* see if we intersect */ - if (hwloc_bitmap_intersects(avail, opal_hwloc_my_cpuset)) { - n_bound++; - numa = w; - } + /* count the number of NUMA nodes to which we are bound */ + for (w=0; w < i; w++) { + if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology, + HWLOC_OBJ_NODE, 0, w, + OPAL_HWLOC_AVAILABLE))) { + continue; } - /* if we are located on more than one NUMA, or we didn't find - * a NUMA we are on, 
then not much we can do - */ - if (1 == n_bound) { - mca_btl_sm_component.mem_node = my_mem_node = numa; - } else { - mca_btl_sm_component.mem_node = my_mem_node = -1; + /* get that NUMA node's available cpus */ + avail = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj); + /* see if we intersect */ + if (hwloc_bitmap_intersects(avail, opal_hwloc_my_cpuset)) { + n_bound++; + numa = w; } } + /* if we are located on more than one NUMA, or we didn't find + * a NUMA we are on, then not much we can do + */ + if (1 == n_bound) { + mca_btl_sm_component.mem_node = my_mem_node = numa; + } else { + mca_btl_sm_component.mem_node = my_mem_node = -1; + } } } diff --git a/opal/mca/hwloc/base/base.h b/opal/mca/hwloc/base/base.h index 826aeb81a84..df3cf7dc25e 100644 --- a/opal/mca/hwloc/base/base.h +++ b/opal/mca/hwloc/base/base.h @@ -1,6 +1,6 @@ /* * Copyright (c) 2011-2012 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2013-2015 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2016 Intel, Inc. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -276,6 +276,16 @@ OPAL_DECLSPEC hwloc_obj_t opal_hwloc_base_get_pu(hwloc_topology_t topo, OPAL_DECLSPEC char* opal_hwloc_base_get_topo_signature(hwloc_topology_t topo); +/* get a string describing the locality of a given process */ +OPAL_DECLSPEC char* opal_hwloc_base_get_locality_string(hwloc_topology_t topo, char *bitmap); + +/* extract a location from the locality string */ +OPAL_DECLSPEC char* opal_hwloc_base_get_location(char *locality, + hwloc_obj_type_t type, + unsigned index); + +OPAL_DECLSPEC opal_hwloc_locality_t opal_hwloc_compute_relative_locality(char *loc1, char *loc2); + END_C_DECLS #endif /* OPAL_HWLOC_BASE_H */ diff --git a/opal/mca/hwloc/base/hwloc_base_util.c b/opal/mca/hwloc/base/hwloc_base_util.c index 040e531352f..812435ee6d1 100644 --- a/opal/mca/hwloc/base/hwloc_base_util.c +++ b/opal/mca/hwloc/base/hwloc_base_util.c @@ -13,7 +13,7 @@ * Copyright (c) 2011-2012 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2012-2015 Los Alamos National Security, LLC. * All rights reserved. - * Copyright (c) 2013-2014 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2016 Intel, Inc. All rights reserved. * Copyright (c) 2015-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. 
* $COPYRIGHT$ @@ -1502,9 +1502,9 @@ static char *hwloc_getline(FILE *fp) ret = fgets(input, OPAL_HWLOC_MAX_ELOG_LINE, fp); if (NULL != ret) { - input[strlen(input)-1] = '\0'; /* remove newline */ - buff = strdup(input); - return buff; + input[strlen(input)-1] = '\0'; /* remove newline */ + buff = strdup(input); + return buff; } return NULL; @@ -2128,3 +2128,249 @@ char* opal_hwloc_base_get_topo_signature(hwloc_topology_t topo) } return sig; } + +char* opal_hwloc_base_get_locality_string(hwloc_topology_t topo, + char *bitmap) +{ + hwloc_obj_t obj; + char *locality=NULL, *tmp, *t2; + unsigned depth, d, width, w; + hwloc_cpuset_t cpuset, avail, result; + hwloc_obj_type_t type; + + /* if this proc is not bound, then there is no locality. We + * know it isn't bound if the cpuset is NULL, or if it is + * all 1's */ + if (NULL == bitmap) { + return NULL; + } + cpuset = hwloc_bitmap_alloc(); + hwloc_bitmap_list_sscanf(cpuset, bitmap); + if (hwloc_bitmap_isfull(cpuset)) { + hwloc_bitmap_free(cpuset); + return NULL; + } + + /* we are going to use a bitmap to save the results so + * that we can use a hwloc utility to print them */ + result = hwloc_bitmap_alloc(); + + /* get the max depth of the topology */ + depth = hwloc_topology_get_depth(topo); + + /* start at the first depth below the top machine level */ + for (d=1; d < depth; d++) { + /* get the object type at this depth */ + type = hwloc_get_depth_type(topo, d); + /* if it isn't one of interest, then ignore it */ + if (HWLOC_OBJ_NODE != type && + HWLOC_OBJ_SOCKET != type && + HWLOC_OBJ_CACHE != type && + HWLOC_OBJ_CORE != type && + HWLOC_OBJ_PU != type) { + continue; + } + + /* get the width of the topology at this depth */ + width = hwloc_get_nbobjs_by_depth(topo, d); + + /* scan all objects at this depth to see if + * the location overlaps with them + */ + for (w=0; w < width; w++) { + /* get the object at this depth/index */ + obj = hwloc_get_obj_by_depth(topo, d, w); + /* get the available cpuset for this obj */ + 
avail = opal_hwloc_base_get_available_cpus(topo, obj); + /* see if the location intersects with it */ + if (hwloc_bitmap_intersects(avail, cpuset)) { + hwloc_bitmap_set(result, w); + } + } + /* it should be impossible, but allow for the possibility + * that we came up empty at this depth */ + if (!hwloc_bitmap_iszero(result)) { + hwloc_bitmap_list_asprintf(&tmp, result); + switch(obj->type) { + case HWLOC_OBJ_NODE: + asprintf(&t2, "%sNM%s:", (NULL == locality) ? "" : locality, tmp); + if (NULL != locality) { + free(locality); + } + locality = t2; + break; + case HWLOC_OBJ_SOCKET: + asprintf(&t2, "%sSK%s:", (NULL == locality) ? "" : locality, tmp); + if (NULL != locality) { + free(locality); + } + locality = t2; + break; + case HWLOC_OBJ_CACHE: + if (3 == obj->attr->cache.depth) { + asprintf(&t2, "%sL3%s:", (NULL == locality) ? "" : locality, tmp); + if (NULL != locality) { + free(locality); + } + locality = t2; + break; + } else if (2 == obj->attr->cache.depth) { + asprintf(&t2, "%sL2%s:", (NULL == locality) ? "" : locality, tmp); + if (NULL != locality) { + free(locality); + } + locality = t2; + break; + } else { + asprintf(&t2, "%sL1%s:", (NULL == locality) ? "" : locality, tmp); + if (NULL != locality) { + free(locality); + } + locality = t2; + break; + } + break; + case HWLOC_OBJ_CORE: + asprintf(&t2, "%sCR%s:", (NULL == locality) ? "" : locality, tmp); + if (NULL != locality) { + free(locality); + } + locality = t2; + break; + case HWLOC_OBJ_PU: + asprintf(&t2, "%sHT%s:", (NULL == locality) ? 
"" : locality, tmp); + if (NULL != locality) { + free(locality); + } + locality = t2; + break; + default: + /* just ignore it */ + break; + } + free(tmp); + } + hwloc_bitmap_zero(result); + } + hwloc_bitmap_free(result); + hwloc_bitmap_free(cpuset); + + /* remove the trailing colon */ + if (NULL != locality) { + locality[strlen(locality)-1] = '\0'; + } + return locality; +} + +char* opal_hwloc_base_get_location(char *locality, + hwloc_obj_type_t type, + unsigned index) +{ + char **loc; + char *srch, *ans = NULL; + size_t n; + + if (NULL == locality) { + return NULL; + } + switch(type) { + case HWLOC_OBJ_NODE: + srch = "NM"; + break; + case HWLOC_OBJ_SOCKET: + srch = "SK"; + break; + case HWLOC_OBJ_CACHE: + if (3 == index) { + srch = "L3"; + } else if (2 == index) { + srch = "L2"; + } else { + srch = "L0"; + } + break; + case HWLOC_OBJ_CORE: + srch = "CR"; + break; + case HWLOC_OBJ_PU: + srch = "HT"; + break; + default: + return NULL; + } + loc = opal_argv_split(locality, ':'); + for (n=0; NULL != loc[n]; n++) { + if (0 == strncmp(loc[n], srch, 2)) { + ans = strdup(&loc[n][2]); + break; + } + } + opal_argv_free(loc); + + return ans; +} + +opal_hwloc_locality_t opal_hwloc_compute_relative_locality(char *loc1, char *loc2) +{ + opal_hwloc_locality_t locality; + char **set1, **set2; + hwloc_bitmap_t bit1, bit2; + size_t n1, n2; + + /* start with what we know - they share a node on a cluster + * NOTE: we may alter that latter part as hwloc's ability to + * sense multi-cu, multi-cluster systems grows + */ + locality = OPAL_PROC_ON_NODE | OPAL_PROC_ON_HOST | OPAL_PROC_ON_CU | OPAL_PROC_ON_CLUSTER; + + /* if either location is NULL, then that isn't bound */ + if (NULL == loc1 || NULL == loc2) { + return locality; + } + + set1 = opal_argv_split(loc1, ':'); + set2 = opal_argv_split(loc2, ':'); + bit1 = hwloc_bitmap_alloc(); + bit2 = hwloc_bitmap_alloc(); + + /* check each matching type */ + for (n1=0; NULL != set1[n1]; n1++) { + /* convert the location into bitmap */ + 
hwloc_bitmap_list_sscanf(bit1, &set1[n1][2]); + /* find the matching type in set2 */ + for (n2=0; NULL != set2[n2]; n2++) { + if (0 == strncmp(set1[n1], set2[n2], 2)) { + /* convert the location into bitmap */ + hwloc_bitmap_list_sscanf(bit2, &set2[n2][2]); + /* see if they intersect */ + if (hwloc_bitmap_intersects(bit1, bit2)) { + /* set the corresponding locality bit */ + if (0 == strncmp(set1[n1], "NM", 2)) { + locality |= OPAL_PROC_ON_NUMA; + } else if (0 == strncmp(set1[n1], "SK", 2)) { + locality |= OPAL_PROC_ON_SOCKET; + } else if (0 == strncmp(set1[n1], "L3", 2)) { + locality |= OPAL_PROC_ON_L3CACHE; + } else if (0 == strncmp(set1[n1], "L2", 2)) { + locality |= OPAL_PROC_ON_L2CACHE; + } else if (0 == strncmp(set1[n1], "L1", 2)) { + locality |= OPAL_PROC_ON_L1CACHE; + } else if (0 == strncmp(set1[n1], "CR", 2)) { + locality |= OPAL_PROC_ON_CORE; + } else if (0 == strncmp(set1[n1], "HT", 2)) { + locality |= OPAL_PROC_ON_HWTHREAD; + } else { + /* should never happen */ + opal_output(0, "UNRECOGNIZED LOCALITY %s", set1[n1]); + } + } + break; + } + } + } + opal_argv_free(set1); + opal_argv_free(set2); + hwloc_bitmap_free(bit1); + hwloc_bitmap_free(bit2); + return locality; +} diff --git a/opal/mca/pmix/pmix_types.h b/opal/mca/pmix/pmix_types.h index 30def74f67f..6e2c9ab2cc7 100644 --- a/opal/mca/pmix/pmix_types.h +++ b/opal/mca/pmix/pmix_types.h @@ -104,6 +104,8 @@ BEGIN_C_DECLS /**** no PMIx equivalent ****/ #define OPAL_PMIX_LOCALITY "pmix.loc" // (uint16_t) relative locality of two procs +#define OPAL_PMIX_TOPOLOGY_SIGNATURE "pmix.toposig" // (char*) topology signature string +#define OPAL_PMIX_LOCALITY_STRING "pmix.locstr" // (char*) string describing a proc's location #define OPAL_PMIX_NODE_LIST "pmix.nlist" // (char*) comma-delimited list of nodes running procs for the specified nspace #define OPAL_PMIX_ALLOCATED_NODELIST "pmix.alist" // (char*) comma-delimited list of all nodes in this allocation regardless of diff --git 
a/orte/mca/ess/pmi/ess_pmi_module.c b/orte/mca/ess/pmi/ess_pmi_module.c index addb2b67526..d8780d99bc9 100644 --- a/orte/mca/ess/pmi/ess_pmi_module.c +++ b/orte/mca/ess/pmi/ess_pmi_module.c @@ -94,7 +94,7 @@ static int rte_init(void) char *val; int u32, *u32ptr; uint16_t u16, *u16ptr; - char **peers=NULL, *mycpuset, **cpusets=NULL; + char **peers=NULL, *mycpuset; opal_process_name_t wildcard_rank, pname; bool bool_val, *bool_ptr = &bool_val, tdir_mca_override = false; size_t i; @@ -248,7 +248,7 @@ static int rte_init(void) /* retrieve temp directories info */ OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_TMPDIR, &wildcard_rank, &val, OPAL_STRING); if (OPAL_SUCCESS == ret && NULL != val) { - /* We want to provide user with ability + /* We want to provide user with ability * to override RM settings at his own risk */ if( NULL == orte_process_info.top_session_dir ){ @@ -264,7 +264,7 @@ static int rte_init(void) if( !tdir_mca_override ){ OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_NSDIR, &wildcard_rank, &val, OPAL_STRING); if (OPAL_SUCCESS == ret && NULL != val) { - /* We want to provide user with ability + /* We want to provide user with ability * to override RM settings at his own risk */ if( NULL == orte_process_info.job_session_dir ){ @@ -281,7 +281,7 @@ static int rte_init(void) if( !tdir_mca_override ){ OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_PROCDIR, &wildcard_rank, &val, OPAL_STRING); if (OPAL_SUCCESS == ret && NULL != val) { - /* We want to provide user with ability + /* We want to provide user with ability * to override RM settings at his own risk */ if( NULL == orte_process_info.proc_session_dir ){ @@ -385,65 +385,64 @@ static int rte_init(void) if (OPAL_SUCCESS == ret && NULL != val) { peers = opal_argv_split(val, ','); free(val); - /* and their cpusets, if available */ - OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCAL_CPUSETS, - &wildcard_rank, &val, OPAL_STRING); - if (OPAL_SUCCESS == ret && NULL != val) { - cpusets = opal_argv_split(val, 
':'); - free(val); - } else { - cpusets = NULL; - } } else { peers = NULL; - cpusets = NULL; } } else { peers = NULL; - cpusets = NULL; } /* set the locality */ if (NULL != peers) { - /* indentify our cpuset */ - if (NULL != cpusets) { - mycpuset = cpusets[orte_process_info.my_local_rank]; + /* identify our location */ + val = NULL; + OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCALITY_STRING, + ORTE_PROC_MY_NAME, &val, OPAL_STRING); + if (OPAL_SUCCESS == ret && NULL != val) { + mycpuset = val; } else { mycpuset = NULL; } pname.jobid = ORTE_PROC_MY_NAME->jobid; for (i=0; NULL != peers[i]; i++) { - kv = OBJ_NEW(opal_value_t); - kv->key = strdup(OPAL_PMIX_LOCALITY); - kv->type = OPAL_UINT16; pname.vpid = strtoul(peers[i], NULL, 10); if (pname.vpid == ORTE_PROC_MY_NAME->vpid) { /* we are fully local to ourselves */ u16 = OPAL_PROC_ALL_LOCAL; - } else if (NULL == mycpuset || NULL == cpusets[i] || - 0 == strcmp(cpusets[i], "UNBOUND")) { - /* all we can say is that it shares our node */ - u16 = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE; } else { - /* we have it, so compute the locality */ - u16 = opal_hwloc_base_get_relative_locality(opal_hwloc_topology, mycpuset, cpusets[i]); + val = NULL; + OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCALITY_STRING, + &pname, &val, OPAL_STRING); + if (OPAL_SUCCESS == ret && NULL != val) { + u16 = opal_hwloc_compute_relative_locality(mycpuset, val); + } else { + /* all we can say is that it shares our node */ + u16 = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE; + } } + kv = OBJ_NEW(opal_value_t); + kv->key = strdup(OPAL_PMIX_LOCALITY); + kv->type = OPAL_UINT16; OPAL_OUTPUT_VERBOSE((1, orte_ess_base_framework.framework_output, - "%s ess:pmi:locality: proc %s locality %x", + "%s ess:pmi:locality: proc %s locality %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&pname), u16)); + ORTE_NAME_PRINT(&pname), opal_hwloc_base_print_locality(u16))); kv->data.uint16 = u16; ret = 
opal_pmix.store_local(&pname, kv); if (OPAL_SUCCESS != ret) { error = "local store of locality"; opal_argv_free(peers); - opal_argv_free(cpusets); + if (NULL != mycpuset) { + free(mycpuset); + } goto error; } OBJ_RELEASE(kv); } opal_argv_free(peers); - opal_argv_free(cpusets); + if (NULL != mycpuset) { + free(mycpuset); + } } /* now that we have all required info, complete the setup */ diff --git a/orte/mca/rmaps/base/rmaps_base_map_job.c b/orte/mca/rmaps/base/rmaps_base_map_job.c old mode 100755 new mode 100644 diff --git a/orte/orted/pmix/pmix_server_register_fns.c b/orte/orted/pmix/pmix_server_register_fns.c index 7502221ba76..c343b4a17ea 100644 --- a/orte/orted/pmix/pmix_server_register_fns.c +++ b/orte/orted/pmix/pmix_server_register_fns.c @@ -38,6 +38,7 @@ #include "opal/util/argv.h" #include "opal/util/output.h" #include "opal/util/error.h" +#include "opal/mca/hwloc/base/base.h" #include "opal/mca/pmix/pmix.h" #include "orte/util/name_fns.h" @@ -59,7 +60,7 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata) opal_value_t *kv; orte_node_t *node, *mynode; opal_vpid_t vpid; - char **list, **procs, **micro, *tmp, *regex, *cpulist, *peerlist; + char **list, **procs, **micro, *tmp, *regex; orte_job_t *dmns; orte_job_map_t *map; orte_app_context_t *app; @@ -239,13 +240,22 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata) kv->data.uint32 = jdata->total_slots_alloc; opal_list_append(info, &kv->super); + /* topology signature */ + kv = OBJ_NEW(opal_value_t); + kv->key = strdup(OPAL_PMIX_TOPOLOGY_SIGNATURE); + kv->type = OPAL_STRING; + kv->data.string = strdup(orte_topo_signature); + opal_list_append(info, &kv->super); + /* register any local clients */ vpid = ORTE_VPID_MAX; + micro = NULL; for (i=0; i < mynode->procs->size; i++) { if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(mynode->procs, i))) { continue; } if (pptr->name.jobid == jdata->jobid) { + opal_argv_append_nosize(&micro, ORTE_VPID_PRINT(pptr->name.vpid)); if (pptr->name.vpid <
vpid) { vpid = pptr->name.vpid; } @@ -256,6 +266,16 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata) } } } + if (NULL != micro) { + /* pass the local peers */ + kv = OBJ_NEW(opal_value_t); + kv->key = strdup(OPAL_PMIX_LOCAL_PEERS); + kv->type = OPAL_STRING; + kv->data.string = opal_argv_join(micro, ','); + opal_argv_free(micro); + opal_list_append(info, &kv->super); + } + /* pass the local ldr */ kv = OBJ_NEW(opal_value_t); kv->key = strdup(OPAL_PMIX_LOCALLDR); @@ -274,71 +294,7 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata) if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, n))) { continue; } - /* construct the list of local peers, while adding - * each proc's locality info */ - list = NULL; - procs = NULL; - cpulist = NULL; - peerlist = NULL; - vpid = ORTE_VPID_MAX; - for (i=0; i < node->procs->size; i++) { - if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) { - continue; - } - if (pptr->name.jobid == jdata->jobid) { - opal_argv_append_nosize(&list, ORTE_VPID_PRINT(pptr->name.vpid)); - if (pptr->name.vpid < vpid) { - vpid = pptr->name.vpid; - } - /* note that we have to pass the cpuset for each local - * peer so locality can be computed */ - tmp = NULL; - if (orte_get_attribute(&pptr->attributes, ORTE_PROC_CPU_BITMAP, (void**)&tmp, OPAL_STRING)) { - if (NULL != tmp) { - opal_argv_append_nosize(&procs, tmp); - free(tmp); - } else { - opal_argv_append_nosize(&procs, "UNBOUND"); - } - } else { - opal_argv_append_nosize(&procs, "UNBOUND"); - } - } - } - /* construct the list of peers for transmission */ - if (NULL != list) { - peerlist = opal_argv_join(list, ','); - opal_argv_free(list); - list = NULL; - } - /* construct the list of cpusets for transmission */ - if (NULL != procs) { - cpulist = opal_argv_join(procs, ':'); - opal_argv_free(procs); - procs = NULL; - } - - /* if this is me, then pass the peers and cpusets to myself - * in order to maintain backward compatibility for the non-pmix 
- * components in OPAL/pmix */ - if (node == mynode) { - /* pass the list of peers */ - kv = OBJ_NEW(opal_value_t); - kv->key = strdup(OPAL_PMIX_LOCAL_PEERS); - kv->type = OPAL_STRING; - kv->data.string = strdup(peerlist); - opal_list_append(info, &kv->super); - - /* pass the list of cpusets */ - kv = OBJ_NEW(opal_value_t); - kv->key = strdup(OPAL_PMIX_LOCAL_CPUSETS); - kv->type = OPAL_STRING; - kv->data.string = strdup(cpulist); - opal_list_append(info, &kv->super); - - } - - /* now cycle across each proc on this node, passing all data that + /* cycle across each proc on this node, passing all data that * varies by proc */ for (i=0; i < node->procs->size; i++) { if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) { @@ -363,19 +319,18 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata) kv->data.name.vpid = pptr->name.vpid; opal_list_append(pmap, &kv->super); - /* pass the list of peers */ - kv = OBJ_NEW(opal_value_t); - kv->key = strdup(OPAL_PMIX_LOCAL_PEERS); - kv->type = OPAL_STRING; - kv->data.string = strdup(peerlist); - opal_list_append(pmap, &kv->super); - - /* pass the list of cpusets */ - kv = OBJ_NEW(opal_value_t); - kv->key = strdup(OPAL_PMIX_LOCAL_CPUSETS); - kv->type = OPAL_STRING; - kv->data.string = strdup(cpulist); - opal_list_append(pmap, &kv->super); + /* location, for local procs */ + if (node == mynode) { + if (orte_get_attribute(&pptr->attributes, ORTE_PROC_CPU_BITMAP, (void**)&tmp, OPAL_STRING)) { + kv = OBJ_NEW(opal_value_t); + kv->key = strdup(OPAL_PMIX_LOCALITY_STRING); + kv->type = OPAL_STRING; + kv->data.string = opal_hwloc_base_get_locality_string(opal_hwloc_topology, tmp); + opal_output(0, "PROC %s LOCALITY %s", ORTE_NAME_PRINT(&pptr->name), kv->data.string); + opal_list_append(pmap, &kv->super); + free(tmp); + } + } /* appnum */ kv = OBJ_NEW(opal_value_t); @@ -441,13 +396,6 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata) kv->data.uint32 = pptr->node->index; opal_list_append(pmap, 
&kv->super); } - /* cleanup */ - if (NULL != cpulist) { - free(cpulist); - } - if (NULL != peerlist) { - free(peerlist); - } } /* mark the job as registered */