diff --git a/opal/mca/btl/sm/btl_sm.c b/opal/mca/btl/sm/btl_sm.c index d5a8d31e0ae..4bc07d9a2d8 100644 --- a/opal/mca/btl/sm/btl_sm.c +++ b/opal/mca/btl/sm/btl_sm.c @@ -16,7 +16,7 @@ * All rights reserved. * Copyright (c) 2010-2012 IBM Corporation. All rights reserved. * Copyright (c) 2012 Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013-2015 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2016 Intel, Inc. All rights reserved. * Copyright (c) 2014-2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2016 ARM, Inc. All rights reserved. @@ -52,6 +52,7 @@ #include "opal/util/show_help.h" #include "opal/util/printf.h" #include "opal/mca/hwloc/base/base.h" +#include "opal/mca/pmix/pmix.h" #include "opal/mca/shmem/base/base.h" #include "opal/mca/shmem/shmem.h" @@ -223,23 +224,28 @@ sm_btl_first_time_init(mca_btl_sm_t *sm_btl, int my_mem_node, num_mem_nodes, i, rc; mca_common_sm_mpool_resources_t *res = NULL; mca_btl_sm_component_t* m = &mca_btl_sm_component; + char *loc, *mynuma; + opal_process_name_t wildcard_rank; /* Assume we don't have hwloc support and fill in dummy info */ mca_btl_sm_component.mem_node = my_mem_node = 0; mca_btl_sm_component.num_mem_nodes = num_mem_nodes = 1; - /* If we have hwloc support, then get accurate information */ - if (NULL != opal_hwloc_topology) { - i = opal_hwloc_base_get_nbobjs_by_type(opal_hwloc_topology, - HWLOC_OBJ_NODE, 0, - OPAL_HWLOC_AVAILABLE); - - /* If we find >0 NUMA nodes, then investigate further */ - if (i > 0) { - int numa=0, w; - unsigned n_bound=0; - hwloc_cpuset_t avail; - hwloc_obj_t obj; + /* see if we were given a topology signature */ + wildcard_rank.jobid = OPAL_PROC_MY_NAME.jobid; + wildcard_rank.vpid = OPAL_VPID_WILDCARD; + OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, OPAL_PMIX_TOPOLOGY_SIGNATURE, + &wildcard_rank, &loc, OPAL_STRING); + if (OPAL_SUCCESS == rc) { + /* the number of NUMA nodes is right at the front */ + mca_btl_sm_component.num_mem_nodes = num_mem_nodes = strtoul(loc, NULL, 10); + free(loc); + } else { + /* If we have hwloc support, then get accurate information */ + if (NULL != opal_hwloc_topology) { + i = opal_hwloc_base_get_nbobjs_by_type(opal_hwloc_topology, + HWLOC_OBJ_NODE, 0, + OPAL_HWLOC_AVAILABLE); /* JMS This tells me how many numa nodes are *available*, but it's not how many are being used *by this job*. @@ -248,33 +254,65 @@ sm_btl_first_time_init(mca_btl_sm_t *sm_btl, should be improved to be how many NUMA nodes are being used *in this job*. 
*/ mca_btl_sm_component.num_mem_nodes = num_mem_nodes = i; + } + } + /* see if we were given our location */ + OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, OPAL_PMIX_LOCALITY_STRING, + &OPAL_PROC_MY_NAME, &loc, OPAL_STRING); + if (OPAL_SUCCESS == rc) { + if (NULL == loc) { + mca_btl_sm_component.mem_node = my_mem_node = -1; + } else { + /* get our NUMA location */ + mynuma = opal_hwloc_base_get_location(loc, HWLOC_OBJ_NODE, 0); + if (NULL == mynuma || + NULL != strchr(mynuma, ',') || + NULL != strchr(mynuma, '-')) { + /* we either have no idea what NUMA we are on, or we + * are on multiple NUMA nodes */ + mca_btl_sm_component.mem_node = my_mem_node = -1; + } else { + /* we are bound to a single NUMA node */ + my_mem_node = strtoul(mynuma, NULL, 10); + mca_btl_sm_component.mem_node = my_mem_node; + } + if (NULL != mynuma) { + free(mynuma); + } + free(loc); + } + } else { + /* If we have hwloc support, then get accurate information */ + if (NULL != opal_hwloc_topology && num_mem_nodes > 0 && + NULL != opal_process_info.cpuset) { + int numa=0, w; + unsigned n_bound=0; + hwloc_cpuset_t avail; + hwloc_obj_t obj; - /* if we are not bound, then there is nothing further to do */ - if (NULL != opal_process_info.cpuset) { - /* count the number of NUMA nodes to which we are bound */ - for (w=0; w < i; w++) { - if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology, - HWLOC_OBJ_NODE, 0, w, - OPAL_HWLOC_AVAILABLE))) { - continue; - } - /* get that NUMA node's available cpus */ - avail = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj); - /* see if we intersect */ - if (hwloc_bitmap_intersects(avail, opal_hwloc_my_cpuset)) { - n_bound++; - numa = w; - } + /* count the number of NUMA nodes to which we are bound */ + for (w=0; w < i; w++) { + if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology, + HWLOC_OBJ_NODE, 0, w, + OPAL_HWLOC_AVAILABLE))) { + continue; } - /* if we are located on more than one NUMA, or we didn't find - * a NUMA we are on, then not much we can do - */ - if (1 == n_bound) { - mca_btl_sm_component.mem_node = my_mem_node = numa; - } else { - mca_btl_sm_component.mem_node = my_mem_node = -1; + /* get that NUMA node's available cpus */ + avail = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj); + /* see if we intersect */ + if (hwloc_bitmap_intersects(avail, opal_hwloc_my_cpuset)) { + n_bound++; + numa = w; } } + /* if we are located on more than one NUMA, or we didn't find + * a NUMA we are on, then not much we can do + */ + if (1 == n_bound) { + mca_btl_sm_component.mem_node = my_mem_node = numa; + } else { + mca_btl_sm_component.mem_node = my_mem_node = -1; + } } } diff --git a/opal/mca/hwloc/base/base.h b/opal/mca/hwloc/base/base.h index 826aeb81a84..df3cf7dc25e 100644 --- a/opal/mca/hwloc/base/base.h +++ b/opal/mca/hwloc/base/base.h @@ -1,6 +1,6 @@ /* * Copyright (c) 2011-2012 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2013-2015 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2016 Intel, Inc. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -276,6 +276,16 @@ OPAL_DECLSPEC hwloc_obj_t opal_hwloc_base_get_pu(hwloc_topology_t topo, OPAL_DECLSPEC char* opal_hwloc_base_get_topo_signature(hwloc_topology_t topo); +/* get a string describing the locality of a given process */ +OPAL_DECLSPEC char* opal_hwloc_base_get_locality_string(hwloc_topology_t topo, char *bitmap); + +/* extract a location from the locality string */ +OPAL_DECLSPEC char* opal_hwloc_base_get_location(char *locality, + hwloc_obj_type_t type, + unsigned index); + +OPAL_DECLSPEC opal_hwloc_locality_t opal_hwloc_compute_relative_locality(char *loc1, char *loc2); + END_C_DECLS #endif /* OPAL_HWLOC_BASE_H */ diff --git a/opal/mca/hwloc/base/hwloc_base_util.c b/opal/mca/hwloc/base/hwloc_base_util.c index 040e531352f..812435ee6d1 100644 --- a/opal/mca/hwloc/base/hwloc_base_util.c +++ b/opal/mca/hwloc/base/hwloc_base_util.c @@ -13,7 +13,7 @@ * Copyright (c) 2011-2012 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2012-2015 Los Alamos National Security, LLC. * All rights reserved. - * Copyright (c) 2013-2014 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2016 Intel, Inc. All rights reserved. * Copyright (c) 2015-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. * $COPYRIGHT$ @@ -1502,9 +1502,9 @@ static char *hwloc_getline(FILE *fp) ret = fgets(input, OPAL_HWLOC_MAX_ELOG_LINE, fp); if (NULL != ret) { - input[strlen(input)-1] = '\0'; /* remove newline */ - buff = strdup(input); - return buff; + input[strlen(input)-1] = '\0'; /* remove newline */ + buff = strdup(input); + return buff; } return NULL; @@ -2128,3 +2128,249 @@ char* opal_hwloc_base_get_topo_signature(hwloc_topology_t topo) } return sig; } + +char* opal_hwloc_base_get_locality_string(hwloc_topology_t topo, + char *bitmap) +{ + hwloc_obj_t obj; + char *locality=NULL, *tmp, *t2; + unsigned depth, d, width, w; + hwloc_cpuset_t cpuset, avail, result; + hwloc_obj_type_t type; + + /* if this proc is not bound, then there is no locality. 
We + * know it isn't bound if the cpuset is NULL, or if it is + * all 1's */ + if (NULL == bitmap) { + return NULL; + } + cpuset = hwloc_bitmap_alloc(); + hwloc_bitmap_list_sscanf(cpuset, bitmap); + if (hwloc_bitmap_isfull(cpuset)) { + hwloc_bitmap_free(cpuset); + return NULL; + } + + /* we are going to use a bitmap to save the results so + * that we can use a hwloc utility to print them */ + result = hwloc_bitmap_alloc(); + + /* get the max depth of the topology */ + depth = hwloc_topology_get_depth(topo); + + /* start at the first depth below the top machine level */ + for (d=1; d < depth; d++) { + /* get the object type at this depth */ + type = hwloc_get_depth_type(topo, d); + /* if it isn't one of interest, then ignore it */ + if (HWLOC_OBJ_NODE != type && + HWLOC_OBJ_SOCKET != type && + HWLOC_OBJ_CACHE != type && + HWLOC_OBJ_CORE != type && + HWLOC_OBJ_PU != type) { + continue; + } + + /* get the width of the topology at this depth */ + width = hwloc_get_nbobjs_by_depth(topo, d); + + /* scan all objects at this depth to see if + * the location overlaps with them + */ + for (w=0; w < width; w++) { + /* get the object at this depth/index */ + obj = hwloc_get_obj_by_depth(topo, d, w); + /* get the available cpuset for this obj */ + avail = opal_hwloc_base_get_available_cpus(topo, obj); + /* see if the location intersects with it */ + if (hwloc_bitmap_intersects(avail, cpuset)) { + hwloc_bitmap_set(result, w); + } + } + /* it should be impossible, but allow for the possibility + * that we came up empty at this depth */ + if (!hwloc_bitmap_iszero(result)) { + hwloc_bitmap_list_asprintf(&tmp, result); + switch(obj->type) { + case HWLOC_OBJ_NODE: + asprintf(&t2, "%sNM%s:", (NULL == locality) ? "" : locality, tmp); + if (NULL != locality) { + free(locality); + } + locality = t2; + break; + case HWLOC_OBJ_SOCKET: + asprintf(&t2, "%sSK%s:", (NULL == locality) ? "" : locality, tmp); + if (NULL != locality) { + free(locality); + } + locality = t2; + break; + case HWLOC_OBJ_CACHE: + if (3 == obj->attr->cache.depth) { + asprintf(&t2, "%sL3%s:", (NULL == locality) ? "" : locality, tmp); + if (NULL != locality) { + free(locality); + } + locality = t2; + break; + } else if (2 == obj->attr->cache.depth) { + asprintf(&t2, "%sL2%s:", (NULL == locality) ? "" : locality, tmp); + if (NULL != locality) { + free(locality); + } + locality = t2; + break; + } else { + asprintf(&t2, "%sL1%s:", (NULL == locality) ? "" : locality, tmp); + if (NULL != locality) { + free(locality); + } + locality = t2; + break; + } + break; + case HWLOC_OBJ_CORE: + asprintf(&t2, "%sCR%s:", (NULL == locality) ? "" : locality, tmp); + if (NULL != locality) { + free(locality); + } + locality = t2; + break; + case HWLOC_OBJ_PU: + asprintf(&t2, "%sHT%s:", (NULL == locality) ? 
"" : locality, tmp); + if (NULL != locality) { + free(locality); + } + locality = t2; + break; + default: + /* just ignore it */ + break; + } + free(tmp); + } + hwloc_bitmap_zero(result); + } + hwloc_bitmap_free(result); + hwloc_bitmap_free(cpuset); + + /* remove the trailing colon */ + if (NULL != locality) { + locality[strlen(locality)-1] = '\0'; + } + return locality; +} + +char* opal_hwloc_base_get_location(char *locality, + hwloc_obj_type_t type, + unsigned index) +{ + char **loc; + char *srch, *ans = NULL; + size_t n; + + if (NULL == locality) { + return NULL; + } + switch(type) { + case HWLOC_OBJ_NODE: + srch = "NM"; + break; + case HWLOC_OBJ_SOCKET: + srch = "SK"; + break; + case HWLOC_OBJ_CACHE: + if (3 == index) { + srch = "L3"; + } else if (2 == index) { + srch = "L2"; + } else { + srch = "L0"; + } + break; + case HWLOC_OBJ_CORE: + srch = "CR"; + break; + case HWLOC_OBJ_PU: + srch = "HT"; + break; + default: + return NULL; + } + loc = opal_argv_split(locality, ':'); + for (n=0; NULL != loc[n]; n++) { + if (0 == strncmp(loc[n], srch, 2)) { + ans = strdup(&loc[n][2]); + break; + } + } + opal_argv_free(loc); + + return ans; +} + +opal_hwloc_locality_t opal_hwloc_compute_relative_locality(char *loc1, char *loc2) +{ + opal_hwloc_locality_t locality; + char **set1, **set2; + hwloc_bitmap_t bit1, bit2; + size_t n1, n2; + + /* start with what we know - they share a node on a cluster + * NOTE: we may alter that latter part as hwloc's ability to + * sense multi-cu, multi-cluster systems grows + */ + locality = OPAL_PROC_ON_NODE | OPAL_PROC_ON_HOST | OPAL_PROC_ON_CU | OPAL_PROC_ON_CLUSTER; + + /* if either location is NULL, then that isn't bound */ + if (NULL == loc1 || NULL == loc2) { + return locality; + } + + set1 = opal_argv_split(loc1, ':'); + set2 = opal_argv_split(loc2, ':'); + bit1 = hwloc_bitmap_alloc(); + bit2 = hwloc_bitmap_alloc(); + + /* check each matching type */ + for (n1=0; NULL != set1[n1]; n1++) { + /* convert the location into bitmap */ + hwloc_bitmap_list_sscanf(bit1, &set1[n1][2]); + /* find the matching type in set2 */ + for (n2=0; NULL != set2[n2]; n2++) { + if (0 == strncmp(set1[n1], set2[n2], 2)) { + /* convert the location into bitmap */ + hwloc_bitmap_list_sscanf(bit2, &set2[n2][2]); + /* see if they intersect */ + if (hwloc_bitmap_intersects(bit1, bit2)) { + /* set the corresponding locality bit */ + if (0 == strncmp(set1[n1], "NM", 2)) { + locality |= OPAL_PROC_ON_NUMA; + } else if (0 == strncmp(set1[n1], "SK", 2)) { + locality |= OPAL_PROC_ON_SOCKET; + } else if (0 == strncmp(set1[n1], "L3", 2)) { + locality |= OPAL_PROC_ON_L3CACHE; + } else if (0 == strncmp(set1[n1], "L2", 2)) { + locality |= OPAL_PROC_ON_L2CACHE; + } else if (0 == strncmp(set1[n1], "L1", 2)) { + locality |= OPAL_PROC_ON_L1CACHE; + } else if (0 == strncmp(set1[n1], "CR", 2)) { + locality |= OPAL_PROC_ON_CORE; + } else if (0 == strncmp(set1[n1], "HT", 2)) { + locality |= OPAL_PROC_ON_HWTHREAD; + } else { + /* should never happen */ + opal_output(0, "UNRECOGNIZED LOCALITY %s", set1[n1]); + } + } + break; + } + } + } + opal_argv_free(set1); + opal_argv_free(set2); + hwloc_bitmap_free(bit1); + hwloc_bitmap_free(bit2); + return locality; +} diff --git a/opal/mca/pmix/pmix_types.h b/opal/mca/pmix/pmix_types.h index 30def74f67f..6e2c9ab2cc7 100644 --- a/opal/mca/pmix/pmix_types.h +++ b/opal/mca/pmix/pmix_types.h @@ -104,6 +104,8 @@ BEGIN_C_DECLS /**** no PMIx equivalent ****/ #define OPAL_PMIX_LOCALITY "pmix.loc" // (uint16_t) relative locality of two procs +#define OPAL_PMIX_TOPOLOGY_SIGNATURE 
"pmix.toposig" // (char*) topology signature string +#define OPAL_PMIX_LOCALITY_STRING "pmix.locstr" // (char*) string describing a proc's location #define OPAL_PMIX_NODE_LIST "pmix.nlist" // (char*) comma-delimited list of nodes running procs for the specified nspace #define OPAL_PMIX_ALLOCATED_NODELIST "pmix.alist" // (char*) comma-delimited list of all nodes in this allocation regardless of diff --git a/orte/mca/ess/pmi/ess_pmi_module.c b/orte/mca/ess/pmi/ess_pmi_module.c index addb2b67526..d8780d99bc9 100644 --- a/orte/mca/ess/pmi/ess_pmi_module.c +++ b/orte/mca/ess/pmi/ess_pmi_module.c @@ -94,7 +94,7 @@ static int rte_init(void) char *val; int u32, *u32ptr; uint16_t u16, *u16ptr; - char **peers=NULL, *mycpuset, **cpusets=NULL; + char **peers=NULL, *mycpuset; opal_process_name_t wildcard_rank, pname; bool bool_val, *bool_ptr = &bool_val, tdir_mca_override = false; size_t i; @@ -248,7 +248,7 @@ static int rte_init(void) /* retrieve temp directories info */ OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_TMPDIR, &wildcard_rank, &val, OPAL_STRING); if (OPAL_SUCCESS == ret && NULL != val) { - /* We want to provide user with ability + /* We want to provide user with ability * to override RM settings at his own risk */ if( NULL == orte_process_info.top_session_dir ){ @@ -264,7 +264,7 @@ static int rte_init(void) if( !tdir_mca_override ){ OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_NSDIR, &wildcard_rank, &val, OPAL_STRING); if (OPAL_SUCCESS == ret && NULL != val) { - /* We want to provide user with ability + /* We want to provide user with ability * to override RM settings at his own risk */ if( NULL == orte_process_info.job_session_dir ){ @@ -281,7 +281,7 @@ static int rte_init(void) if( !tdir_mca_override ){ OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_PROCDIR, &wildcard_rank, &val, OPAL_STRING); if (OPAL_SUCCESS == ret && NULL != val) { - /* We want to provide user with ability + /* We want to provide user with ability * to override RM settings at his own risk */ if( NULL == orte_process_info.proc_session_dir ){ @@ -385,65 +385,64 @@ static int rte_init(void) if (OPAL_SUCCESS == ret && NULL != val) { peers = opal_argv_split(val, ','); free(val); - /* and their cpusets, if available */ - OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCAL_CPUSETS, - &wildcard_rank, &val, OPAL_STRING); - if (OPAL_SUCCESS == ret && NULL != val) { - cpusets = opal_argv_split(val, ':'); - free(val); - } else { - cpusets = NULL; - } } else { peers = NULL; - cpusets = NULL; } } else { peers = NULL; - cpusets = NULL; } /* set the locality */ if (NULL != peers) { - /* indentify our cpuset */ - if (NULL != cpusets) { - mycpuset = cpusets[orte_process_info.my_local_rank]; + /* identify our location */ + val = NULL; + OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCALITY_STRING, + ORTE_PROC_MY_NAME, &val, OPAL_STRING); + if (OPAL_SUCCESS == ret && NULL != val) { + mycpuset = val; } else { mycpuset = NULL; } pname.jobid = ORTE_PROC_MY_NAME->jobid; for (i=0; NULL != peers[i]; i++) { - kv = OBJ_NEW(opal_value_t); - kv->key = strdup(OPAL_PMIX_LOCALITY); - kv->type = OPAL_UINT16; pname.vpid = strtoul(peers[i], NULL, 10); if (pname.vpid == ORTE_PROC_MY_NAME->vpid) { /* we are fully local to ourselves */ u16 = OPAL_PROC_ALL_LOCAL; - } else if (NULL == mycpuset || NULL == cpusets[i] || - 0 == strcmp(cpusets[i], "UNBOUND")) { - /* all we can say is that it shares our node */ - u16 = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE; } else { - /* we have it, so compute the locality */ - u16 = 
opal_hwloc_base_get_relative_locality(opal_hwloc_topology, mycpuset, cpusets[i]); + val = NULL; + OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCALITY_STRING, + &pname, &val, OPAL_STRING); + if (OPAL_SUCCESS == ret && NULL != val) { + u16 = opal_hwloc_compute_relative_locality(mycpuset, val); + } else { + /* all we can say is that it shares our node */ + u16 = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE; + } } + kv = OBJ_NEW(opal_value_t); + kv->key = strdup(OPAL_PMIX_LOCALITY); + kv->type = OPAL_UINT16; OPAL_OUTPUT_VERBOSE((1, orte_ess_base_framework.framework_output, - "%s ess:pmi:locality: proc %s locality %x", + "%s ess:pmi:locality: proc %s locality %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&pname), u16)); + ORTE_NAME_PRINT(&pname), opal_hwloc_base_print_locality(u16))); kv->data.uint16 = u16; ret = opal_pmix.store_local(&pname, kv); if (OPAL_SUCCESS != ret) { error = "local store of locality"; opal_argv_free(peers); - opal_argv_free(cpusets); + if (NULL != mycpuset) { + free(mycpuset); + } goto error; } OBJ_RELEASE(kv); } opal_argv_free(peers); - opal_argv_free(cpusets); + if (NULL != mycpuset) { + free(mycpuset); + } } /* now that we have all required info, complete the setup */ diff --git a/orte/mca/rmaps/base/rmaps_base_map_job.c b/orte/mca/rmaps/base/rmaps_base_map_job.c old mode 100755 new mode 100644 diff --git a/orte/orted/pmix/pmix_server_register_fns.c b/orte/orted/pmix/pmix_server_register_fns.c index 7502221ba76..c343b4a17ea 100644 --- a/orte/orted/pmix/pmix_server_register_fns.c +++ b/orte/orted/pmix/pmix_server_register_fns.c @@ -38,6 +38,7 @@ #include "opal/util/argv.h" #include "opal/util/output.h" #include "opal/util/error.h" +#include "opal/mca/hwloc/base/base.h" #include "opal/mca/pmix/pmix.h" #include "orte/util/name_fns.h" @@ -59,7 +60,7 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata) opal_value_t *kv; orte_node_t *node, *mynode; opal_vpid_t vpid; - char **list, **procs, **micro, *tmp, *regex, *cpulist, *peerlist; + char **list, **procs, **micro, *tmp, *regex; orte_job_t *dmns; orte_job_map_t *map; orte_app_context_t *app; @@ -239,13 +240,22 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata) kv->data.uint32 = jdata->total_slots_alloc; opal_list_append(info, &kv->super); + /* topology signature */ + kv = OBJ_NEW(opal_value_t); + kv->key = strdup(OPAL_PMIX_TOPOLOGY_SIGNATURE); + kv->type = OPAL_STRING; + kv->data.string = strdup(orte_topo_signature); + opal_list_append(info, &kv->super); + /* register any local clients */ vpid = ORTE_VPID_MAX; + micro = NULL; for (i=0; i < mynode->procs->size; i++) { if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(mynode->procs, i))) { continue; } if (pptr->name.jobid == jdata->jobid) { + opal_argv_append_nosize(&micro, ORTE_VPID_PRINT(pptr->name.vpid)); if (pptr->name.vpid < vpid) { vpid = pptr->name.vpid; } @@ -256,6 +266,16 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata) } } } + if (NULL != micro) { + /* pass the local peers */ + kv = OBJ_NEW(opal_value_t); + kv->key = strdup(OPAL_PMIX_LOCAL_PEERS); + kv->type = OPAL_STRING; + kv->data.string = opal_argv_join(micro, ','); + opal_argv_free(micro); + opal_list_append(info, &kv->super); + } + /* pass the local ldr */ kv = OBJ_NEW(opal_value_t); kv->key = strdup(OPAL_PMIX_LOCALLDR); @@ -274,71 +294,7 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata) if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, n))) { continue; } - /* construct the list of local peers, while adding - * each
proc's locality info */ - list = NULL; - procs = NULL; - cpulist = NULL; - peerlist = NULL; - vpid = ORTE_VPID_MAX; - for (i=0; i < node->procs->size; i++) { - if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) { - continue; - } - if (pptr->name.jobid == jdata->jobid) { - opal_argv_append_nosize(&list, ORTE_VPID_PRINT(pptr->name.vpid)); - if (pptr->name.vpid < vpid) { - vpid = pptr->name.vpid; - } - /* note that we have to pass the cpuset for each local - * peer so locality can be computed */ - tmp = NULL; - if (orte_get_attribute(&pptr->attributes, ORTE_PROC_CPU_BITMAP, (void**)&tmp, OPAL_STRING)) { - if (NULL != tmp) { - opal_argv_append_nosize(&procs, tmp); - free(tmp); - } else { - opal_argv_append_nosize(&procs, "UNBOUND"); - } - } else { - opal_argv_append_nosize(&procs, "UNBOUND"); - } - } - } - /* construct the list of peers for transmission */ - if (NULL != list) { - peerlist = opal_argv_join(list, ','); - opal_argv_free(list); - list = NULL; - } - /* construct the list of cpusets for transmission */ - if (NULL != procs) { - cpulist = opal_argv_join(procs, ':'); - opal_argv_free(procs); - procs = NULL; - } - - /* if this is me, then pass the peers and cpusets to myself - * in order to maintain backward compatibility for the non-pmix - * components in OPAL/pmix */ - if (node == mynode) { - /* pass the list of peers */ - kv = OBJ_NEW(opal_value_t); - kv->key = strdup(OPAL_PMIX_LOCAL_PEERS); - kv->type = OPAL_STRING; - kv->data.string = strdup(peerlist); - opal_list_append(info, &kv->super); - - /* pass the list of cpusets */ - kv = OBJ_NEW(opal_value_t); - kv->key = strdup(OPAL_PMIX_LOCAL_CPUSETS); - kv->type = OPAL_STRING; - kv->data.string = strdup(cpulist); - opal_list_append(info, &kv->super); - - } - - /* now cycle across each proc on this node, passing all data that + /* cycle across each proc on this node, passing all data that * varies by proc */ for (i=0; i < node->procs->size; i++) { if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) { @@ -363,19 +319,18 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata) kv->data.name.vpid = pptr->name.vpid; opal_list_append(pmap, &kv->super); - /* pass the list of peers */ - kv = OBJ_NEW(opal_value_t); - kv->key = strdup(OPAL_PMIX_LOCAL_PEERS); - kv->type = OPAL_STRING; - kv->data.string = strdup(peerlist); - opal_list_append(pmap, &kv->super); - - /* pass the list of cpusets */ - kv = OBJ_NEW(opal_value_t); - kv->key = strdup(OPAL_PMIX_LOCAL_CPUSETS); - kv->type = OPAL_STRING; - kv->data.string = strdup(cpulist); - opal_list_append(pmap, &kv->super); + /* location, for local procs */ + if (node == mynode) { + if (orte_get_attribute(&pptr->attributes, ORTE_PROC_CPU_BITMAP, (void**)&tmp, OPAL_STRING)) { + kv = OBJ_NEW(opal_value_t); + kv->key = strdup(OPAL_PMIX_LOCALITY_STRING); + kv->type = OPAL_STRING; + kv->data.string = opal_hwloc_base_get_locality_string(opal_hwloc_topology, tmp); + opal_output(0, "PROC %s LOCALITY %s", ORTE_NAME_PRINT(&pptr->name), kv->data.string); + opal_list_append(pmap, &kv->super); + free(tmp); + } + } /* appnum */ kv = OBJ_NEW(opal_value_t); @@ -441,13 +396,6 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata) kv->data.uint32 = pptr->node->index; opal_list_append(pmap, &kv->super); } - /* cleanup */ - if (NULL != cpulist) { - free(cpulist); - } - if (NULL != peerlist) { - free(peerlist); - } } /* mark the job as registered */
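
Usage sketch for the new locality helpers (illustrative only, not part of the patch; the demo function and the literal locality strings are assumptions, everything else comes from the declarations added to opal/mca/hwloc/base/base.h). opal_hwloc_base_get_locality_string() emits one colon-delimited entry per object level, each a two-letter tag (NM, SK, L3/L2/L1, CR, HT) followed by an hwloc bitmap list; opal_hwloc_base_get_location() pulls a single level back out of that string, and opal_hwloc_compute_relative_locality() intersects two such strings level by level to build the locality mask that ess/pmi now stores under OPAL_PMIX_LOCALITY.

/* Illustrative sketch (not part of the patch): exercising the three helpers
 * added in opal/mca/hwloc/base/hwloc_base_util.c with hand-written strings.
 * Assumes an OPAL build tree so the header below is on the include path. */
#include "opal_config.h"
#include <stdio.h>
#include <stdlib.h>

#include "opal/mca/hwloc/base/base.h"

static void locality_demo(void)
{
    /* strings of the form produced by opal_hwloc_base_get_locality_string():
     * NUMA node, socket, L3/L2/L1 cache, core and hwthread bitmaps for two
     * procs bound to different cores under the same L3 */
    char *locA = "NM0:SK0:L30:L22:L12:CR2:HT4-5";
    char *locB = "NM0:SK0:L30:L26:L16:CR6:HT12-13";

    /* pull one level back out of the string; here, proc A's NUMA node ("0").
     * The caller owns the returned copy, which is NULL if the level is absent. */
    char *numa = opal_hwloc_base_get_location(locA, HWLOC_OBJ_NODE, 0);
    printf("proc A NUMA node: %s\n", (NULL == numa) ? "unknown" : numa);

    /* intersect the two strings level by level: in this example the procs
     * share the node, NUMA node, socket and L3, but not L2/L1/core/hwthread */
    opal_hwloc_locality_t rel = opal_hwloc_compute_relative_locality(locA, locB);
    if (rel & OPAL_PROC_ON_SOCKET) {
        printf("procs A and B share a socket\n");
    }

    if (NULL != numa) {
        free(numa);
    }
}

The net effect of the patch is that only these compact per-proc strings (plus the topology signature, from which btl/sm reads the NUMA count at the front) travel through PMIx, replacing the OPAL_PMIX_LOCAL_CPUSETS list that every process previously had to receive and parse.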