diff --git a/ompi/errhandler/errhandler_predefined.c b/ompi/errhandler/errhandler_predefined.c
index 8d235092b71..cd54bb6e30b 100644
--- a/ompi/errhandler/errhandler_predefined.c
+++ b/ompi/errhandler/errhandler_predefined.c
@@ -15,6 +15,7 @@
  * Copyright (c) 2010-2011 Oak Ridge National Labs.  All rights reserved.
  * Copyright (c) 2012      Los Alamos National Security, LLC.
  *                         All rights reserved.
+ * Copyright (c) 2016      Intel, Inc. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -51,7 +52,7 @@ static void out(char *str, char *arg);

 void ompi_mpi_errors_are_fatal_comm_handler(struct ompi_communicator_t **comm,
-					    int *error_code, ...)
+                                            int *error_code, ...)
 {
   char *name;
   struct ompi_communicator_t *abort_comm;
@@ -72,7 +73,7 @@ void ompi_mpi_errors_are_fatal_comm_handler(struct ompi_communicator_t **comm,

 void ompi_mpi_errors_are_fatal_file_handler(struct ompi_file_t **file,
-					    int *error_code, ...)
+                                            int *error_code, ...)
 {
   char *name;
   struct ompi_communicator_t *abort_comm;
@@ -93,7 +94,7 @@ void ompi_mpi_errors_are_fatal_file_handler(struct ompi_file_t **file,

 void ompi_mpi_errors_are_fatal_win_handler(struct ompi_win_t **win,
-					   int *error_code, ...)
+                                           int *error_code, ...)
 {
   char *name;
   struct ompi_communicator_t *abort_comm = NULL;
@@ -111,7 +112,7 @@ void ompi_mpi_errors_are_fatal_win_handler(struct ompi_win_t **win,
 }

 void ompi_mpi_errors_return_comm_handler(struct ompi_communicator_t **comm,
-					 int *error_code, ...)
+                                         int *error_code, ...)
 {
     /* Don't need anything more -- just need this function to exist */
     /* Silence some compiler warnings */
@@ -123,7 +124,7 @@ void ompi_mpi_errors_return_comm_handler(struct ompi_communicator_t **comm,

 void ompi_mpi_errors_return_file_handler(struct ompi_file_t **file,
-					 int *error_code, ...)
+                                         int *error_code, ...)
 {
     /* Don't need anything more -- just need this function to exist */
     /* Silence some compiler warnings */
@@ -135,7 +136,7 @@ void ompi_mpi_errors_return_file_handler(struct ompi_file_t **file,

 void ompi_mpi_errors_return_win_handler(struct ompi_win_t **win,
-					int *error_code, ...)
+                                        int *error_code, ...)
 {
     /* Don't need anything more -- just need this function to exist */
     /* Silence some compiler warnings */
@@ -181,6 +182,7 @@ static void backend_fatal_aggregate(char *type,
     const char* const unknown_error_code =
         "Error code: %d (no associated error message)";
     const char* const unknown_error = "Unknown error";
     const char* const unknown_prefix = "[?:?]";
+    bool generated = false;

     // these do not own what they point to; they're
     // here to avoid repeating expressions such as
@@ -209,6 +211,8 @@ static void backend_fatal_aggregate(char *type,
             err_msg = NULL;
             opal_output(0, "%s", "Could not write to err_msg");
             opal_output(0, unknown_error_code, *error_code);
+        } else {
+            generated = true;
         }
     }
 }
@@ -254,7 +258,9 @@ static void backend_fatal_aggregate(char *type,
     }

     free(prefix);
-    free(err_msg);
+    if (generated) {
+        free(err_msg);
+    }
 }

 /*
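The generated flag above exists because err_msg does not always own its storage: it may point at a message from the error-string table, and only the asprintf-style fallback actually allocates. A minimal standalone sketch of that ownership-flag idiom (lookup_static_message() is hypothetical, not the OMPI lookup):

    #define _GNU_SOURCE            /* for asprintf on glibc */
    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Hypothetical stand-in for an error-string table lookup. */
    static const char *lookup_static_message(int code)
    {
        return (0 == code) ? "Success" : NULL;
    }

    int main(void)
    {
        int code = 42;
        char *msg = (char *) lookup_static_message(code);
        bool generated = false;                    /* do we own msg? */

        if (NULL == msg) {
            if (0 > asprintf(&msg, "Error code: %d (no associated error message)", code)) {
                msg = NULL;                        /* contents undefined on failure */
            } else {
                generated = true;                  /* heap buffer: we own it now */
            }
        }

        printf("%s\n", (NULL == msg) ? "Unknown error" : msg);
        if (generated) {
            free(msg);                             /* never free the static table entry */
        }
        return 0;
    }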
diff --git a/ompi/mca/topo/treematch/topo_treematch_component.c b/ompi/mca/topo/treematch/topo_treematch_component.c
index 221efd0a7ee..6062bf1ed31 100644
--- a/ompi/mca/topo/treematch/topo_treematch_component.c
+++ b/ompi/mca/topo/treematch/topo_treematch_component.c
@@ -4,6 +4,7 @@
  *                         reserved.
  * Copyright (c) 2011-2015 INRIA.  All rights reserved.
  * Copyright (c) 2011-2015 Université Bordeaux 1
+ * Copyright (c) 2016      Intel, Inc. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -61,9 +62,6 @@ mca_topo_treematch_component_2_2_0_t mca_topo_treematch_component =

 static int init_query(bool enable_progress_threads, bool enable_mpi_threads)
 {
-    if(NULL == opal_hwloc_topology) {
-        return OPAL_ERR_NOT_SUPPORTED;
-    }
     return OMPI_SUCCESS;
 }

@@ -97,4 +95,3 @@ static int mca_topo_treematch_component_register(void)
                                            MCA_BASE_VAR_SCOPE_READONLY,
                                            &mca_topo_treematch_component.reorder_mode);
     return OMPI_SUCCESS;
 }
-
diff --git a/ompi/mca/topo/treematch/topo_treematch_dist_graph_create.c b/ompi/mca/topo/treematch/topo_treematch_dist_graph_create.c
index 4d4f4d3f03f..8026d50b630 100644
--- a/ompi/mca/topo/treematch/topo_treematch_dist_graph_create.c
+++ b/ompi/mca/topo/treematch/topo_treematch_dist_graph_create.c
@@ -5,7 +5,7 @@
  *                         reserved.
  * Copyright (c) 2011-2015 INRIA.  All rights reserved.
  * Copyright (c) 2012-2015 Bordeaux Poytechnic Institute
- * Copyright (c) 2015      Intel, Inc. All rights reserved
+ * Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
  * Copyright (c) 2015-2016 Research Organization for Information Science
  *                         and Technology (RIST). All rights reserved.
  * Copyright (c) 2016      Los Alamos National Security, LLC. All rights
@@ -256,7 +256,9 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
         /* Then, we need to know if the processes are bound */
         /* We make the hypothesis that all processes are in */
         /* the same state : all bound or none bound */
-        assert(NULL != opal_hwloc_topology);
+        if (OPAL_SUCCESS != opal_hwloc_base_get_topology()) {
+            goto fallback;
+        }
         root_obj = hwloc_get_root_obj(opal_hwloc_topology);
         if (NULL == root_obj) goto fallback;
@@ -873,7 +875,7 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
             if( -1 == hwloc_err) goto fallback;

             /* Report new binding to ORTE/OPAL */
-            /*  hwloc_bitmap_list_asprintf(&orte_process_info.cpuset,set); */
+            /* hwloc_bitmap_list_asprintf(&orte_process_info.cpuset,set); */
             err = hwloc_bitmap_snprintf (set_as_string,64,set);

 #ifdef __DEBUG__
diff --git a/ompi/runtime/ompi_mpi_init.c b/ompi/runtime/ompi_mpi_init.c
index d7d7a1bf972..af031927355 100644
--- a/ompi/runtime/ompi_mpi_init.c
+++ b/ompi/runtime/ompi_mpi_init.c
@@ -508,16 +508,6 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
     /* check for timing request - get stop time and report elapsed time if so */
     OPAL_TIMING_MNEXT((&tm,"time from completion of rte_init to modex"));

-    /* if hwloc is available but didn't get setup for some
-     * reason, do so now
-     */
-    if (NULL == opal_hwloc_topology) {
-        if (OPAL_SUCCESS != (ret = opal_hwloc_base_get_topology())) {
-            error = "Topology init";
-            goto error;
-        }
-    }
-
     /* Register the default errhandler callback  */
     errtrk.status = OPAL_ERROR;
     errtrk.active = true;
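The removals above share one theme: callers no longer assume opal_hwloc_topology was instantiated during startup; they ask for it on demand and degrade gracefully if it cannot be had. A sketch of that idempotent lazy-getter contract, with illustrative names (get_topology and component_query are not the OMPI functions):

    #include <stdio.h>
    #include <stdlib.h>

    typedef struct { int nnodes; } topo_t;

    static topo_t *topo = NULL;            /* plays the role of opal_hwloc_topology */

    /* Idempotent getter: any component may call it, any number of times;
     * only the first call pays the load cost. */
    static int get_topology(void)
    {
        if (NULL != topo) {
            return 0;                      /* already loaded - cheap no-op */
        }
        topo = malloc(sizeof(*topo));      /* real code: PMIx fetch, else local discovery */
        if (NULL == topo) {
            return -1;                     /* callers fall back, e.g. treematch's "goto fallback" */
        }
        topo->nnodes = 1;
        return 0;
    }

    int main(void)
    {
        if (0 != get_topology()) {
            return 1;                      /* nothing we can do - just fall back */
        }
        printf("NUMA nodes: %d\n", topo->nnodes);
        free(topo);
        return 0;
    }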
diff --git a/opal/mca/btl/openib/btl_openib_component.c b/opal/mca/btl/openib/btl_openib_component.c
index d6c119f6ec5..ff3892fdb92 100644
--- a/opal/mca/btl/openib/btl_openib_component.c
+++ b/opal/mca/btl/openib/btl_openib_component.c
@@ -18,7 +18,7 @@
  * Copyright (c) 2009-2012 Oracle and/or its affiliates.  All rights reserved.
  * Copyright (c) 2011-2015 NVIDIA Corporation.  All rights reserved.
  * Copyright (c) 2012      Oak Ridge National Laboratory.  All rights reserved
- * Copyright (c) 2013-2015 Intel, Inc. All rights reserved
+ * Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
  * Copyright (c) 2014-2016 Research Organization for Information Science
  *                         and Technology (RIST). All rights reserved.
  * Copyright (c) 2014      Bull SAS.  All rights reserved.
@@ -1502,13 +1502,33 @@ static uint64_t read_module_param(char *file, uint64_t value, uint64_t max)
 static uint64_t calculate_total_mem (void)
 {
     hwloc_obj_t machine;
+    int rc;
+    uint64_t mem, *mptr;
+    opal_process_name_t wildcard_rank;

-    machine = hwloc_get_next_obj_by_type (opal_hwloc_topology, HWLOC_OBJ_MACHINE, NULL);
-    if (NULL == machine) {
-        return 0;
+    /* first try to retrieve it from PMIx as it may have
+     * been provided */
+    wildcard_rank.jobid = OPAL_PROC_MY_NAME.jobid;
+    wildcard_rank.vpid = OPAL_VPID_WILDCARD;
+    mptr = &mem;
+    OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, OPAL_PMIX_AVAIL_PHYS_MEMORY,
+                                   &wildcard_rank, &mptr, OPAL_UINT64);
+    if (OPAL_SUCCESS == rc) {
+        return mem;
+    }
+
+    /* if not available, then ensure that the topology has been
+     * loaded and try to get it from there */
+    if (OPAL_SUCCESS == opal_hwloc_base_get_topology()) {
+        machine = hwloc_get_next_obj_by_type (opal_hwloc_topology, HWLOC_OBJ_MACHINE, NULL);
+        if (NULL == machine) {
+            return 0;
+        }
+        return machine->memory.total_memory;
     }

-    return machine->memory.total_memory;
+    /* couldn't find it */
+    return 0;
 }

@@ -2312,7 +2332,8 @@ static float get_ib_dev_distance(struct ibv_device *dev)
     float distance = 0;

     /* Override any distance logic so all devices are used */
-    if (0 != mca_btl_openib_component.ignore_locality) {
+    if (0 != mca_btl_openib_component.ignore_locality ||
+        OPAL_SUCCESS != opal_hwloc_base_get_topology()) {
         return distance;
     }
diff --git a/opal/mca/btl/sm/btl_sm.c b/opal/mca/btl/sm/btl_sm.c
index 4bc07d9a2d8..c4f592939b1 100644
--- a/opal/mca/btl/sm/btl_sm.c
+++ b/opal/mca/btl/sm/btl_sm.c
@@ -52,7 +52,7 @@
 #include "opal/util/show_help.h"
 #include "opal/util/printf.h"
 #include "opal/mca/hwloc/base/base.h"
-#include "opal/mca/pmix/pmix.h"
+#include "opal/mca/pmix/base/base.h"
 #include "opal/mca/shmem/base/base.h"
 #include "opal/mca/shmem/shmem.h"

@@ -242,7 +242,7 @@ sm_btl_first_time_init(mca_btl_sm_t *sm_btl,
         free(loc);
     } else {
         /* If we have hwloc support, then get accurate information */
-        if (NULL != opal_hwloc_topology) {
+        if (OPAL_SUCCESS == opal_hwloc_base_get_topology()) {
             i = opal_hwloc_base_get_nbobjs_by_type(opal_hwloc_topology,
                                                    HWLOC_OBJ_NODE, 0,
                                                    OPAL_HWLOC_AVAILABLE);
@@ -257,6 +257,7 @@ sm_btl_first_time_init(mca_btl_sm_t *sm_btl,
             }
         }
     }
     /* see if we were given our location */
+    loc = NULL;
     OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, OPAL_PMIX_LOCALITY_STRING,
                                    &OPAL_PROC_MY_NAME, &loc, OPAL_STRING);
     if (OPAL_SUCCESS == rc) {
@@ -283,8 +284,7 @@ sm_btl_first_time_init(mca_btl_sm_t *sm_btl,
         }
     } else {
         /* If we have hwloc support, then get accurate information */
-        if (NULL != opal_hwloc_topology && num_mem_nodes > 0 &&
-            NULL != opal_process_info.cpuset) {
+        if (OPAL_SUCCESS == opal_hwloc_base_get_topology() && num_mem_nodes > 0) {
            int numa=0, w;
            unsigned n_bound=0;
            hwloc_cpuset_t avail;
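The added "loc = NULL;" matters because an optional lookup that fails presumably never writes to the output pointer, so testing an uninitialized pointer afterward would be undefined behavior. A generic sketch of that out-pointer hygiene (try_lookup stands in for the modex macro; it is not an OMPI API):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* On success, allocates a value into *out; on failure, *out is untouched. */
    static int try_lookup(const char *key, char **out)
    {
        if (0 == strcmp(key, "present")) {
            *out = strdup("value");
            return 0;
        }
        return -1;                         /* note: *out left as the caller set it */
    }

    int main(void)
    {
        char *loc = NULL;                  /* must not be stack garbage */
        if (0 == try_lookup("missing", &loc) && NULL != loc) {
            printf("got %s\n", loc);
            free(loc);
        } else {
            printf("key not provided, fall back\n");
        }
        return 0;
    }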
diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c
index 6208ea5399d..5f10ccd560b 100644
--- a/opal/mca/btl/smcuda/btl_smcuda.c
+++ b/opal/mca/btl/smcuda/btl_smcuda.c
@@ -18,7 +18,7 @@
  * Copyright (c) 2012      Oracle and/or its affiliates.  All rights reserved.
  * Copyright (c) 2014      Research Organization for Information Science
  *                         and Technology (RIST). All rights reserved.
- * Copyright (c) 2015      Intel, Inc. All rights reserved.
+ * Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -48,6 +48,7 @@
 #include "opal/util/show_help.h"
 #include "opal/util/printf.h"
 #include "opal/mca/hwloc/base/base.h"
+#include "opal/mca/pmix/base/base.h"
 #include "opal/mca/shmem/base/base.h"
 #include "opal/mca/shmem/shmem.h"
 #include "opal/datatype/opal_convertor.h"
@@ -232,23 +233,28 @@ smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl,
     int my_mem_node, num_mem_nodes, i, rc;
     mca_common_sm_mpool_resources_t *res = NULL;
     mca_btl_smcuda_component_t* m = &mca_btl_smcuda_component;
+    char *loc, *mynuma;
+    opal_process_name_t wildcard_rank;

     /* Assume we don't have hwloc support and fill in dummy info */
     mca_btl_smcuda_component.mem_node = my_mem_node = 0;
     mca_btl_smcuda_component.num_mem_nodes = num_mem_nodes = 1;

-    /* If we have hwloc support, then get accurate information */
-    if (NULL != opal_hwloc_topology) {
-        i = opal_hwloc_base_get_nbobjs_by_type(opal_hwloc_topology,
-                                               HWLOC_OBJ_NODE, 0,
-                                               OPAL_HWLOC_AVAILABLE);
-
-        /* If we find >0 NUMA nodes, then investigate further */
-        if (i > 0) {
-            int numa=0, w;
-            unsigned n_bound=0;
-            hwloc_cpuset_t avail;
-            hwloc_obj_t obj;
+    /* see if we were given a topology signature */
+    wildcard_rank.jobid = OPAL_PROC_MY_NAME.jobid;
+    wildcard_rank.vpid = OPAL_VPID_WILDCARD;
+    OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, OPAL_PMIX_TOPOLOGY_SIGNATURE,
+                                   &wildcard_rank, &loc, OPAL_STRING);
+    if (OPAL_SUCCESS == rc) {
+        /* the number of NUMA nodes is right at the front */
+        mca_btl_smcuda_component.num_mem_nodes = num_mem_nodes = strtoul(loc, NULL, 10);
+        free(loc);
+    } else {
+        /* If we have hwloc support, then get accurate information */
+        if (OPAL_SUCCESS == opal_hwloc_base_get_topology()) {
+            i = opal_hwloc_base_get_nbobjs_by_type(opal_hwloc_topology,
+                                                   HWLOC_OBJ_NODE, 0,
+                                                   OPAL_HWLOC_AVAILABLE);

             /* JMS This tells me how many numa nodes are *available*,
                but it's not how many are being used *by this job*.
                Note that this is the value we've previously used (from
                the previous carto-based implementation), but it really
                should be improved to be how many NUMA nodes are being
                used *in this job*. */
             mca_btl_smcuda_component.num_mem_nodes = num_mem_nodes = i;
+        }
+    }
+    /* see if we were given our location */
+    OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, OPAL_PMIX_LOCALITY_STRING,
+                                   &OPAL_PROC_MY_NAME, &loc, OPAL_STRING);
+    if (OPAL_SUCCESS == rc) {
+        if (NULL == loc) {
+            mca_btl_smcuda_component.mem_node = my_mem_node = -1;
+        } else {
+            /* get our NUMA location */
+            mynuma = opal_hwloc_base_get_location(loc, HWLOC_OBJ_NODE, 0);
+            if (NULL == mynuma ||
+                NULL != strchr(mynuma, ',') ||
+                NULL != strchr(mynuma, '-')) {
+                /* we either have no idea what NUMA we are on, or we
+                 * are on multiple NUMA nodes */
+                mca_btl_smcuda_component.mem_node = my_mem_node = -1;
+            } else {
+                /* we are bound to a single NUMA node */
+                my_mem_node = strtoul(mynuma, NULL, 10);
+                mca_btl_smcuda_component.mem_node = my_mem_node;
+            }
+            if (NULL != mynuma) {
+                free(mynuma);
+            }
+            free(loc);
+        }
+    } else {
+        /* If we have hwloc support, then get accurate information */
+        if (OPAL_SUCCESS == opal_hwloc_base_get_topology() &&
+            num_mem_nodes > 0 && NULL != opal_process_info.cpuset) {
+            int numa=0, w;
+            unsigned n_bound=0;
+            hwloc_cpuset_t avail;
+            hwloc_obj_t obj;

-        /* if we are not bound, then there is nothing further to do */
-        if (NULL != opal_process_info.cpuset) {
-            /* count the number of NUMA nodes to which we are bound */
-            for (w=0; w < i; w++) {
-                if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology,
-                                                                   HWLOC_OBJ_NODE, 0, w,
-                                                                   OPAL_HWLOC_AVAILABLE))) {
-                    continue;
-                }
-                /* get that NUMA node's available cpus */
-                avail = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj);
-                /* see if we intersect */
-                if (hwloc_bitmap_intersects(avail, opal_hwloc_my_cpuset)) {
-                    n_bound++;
-                    numa = w;
-                }
+            /* count the number of NUMA nodes to which we are bound */
+            for (w=0; w < i; w++) {
+                if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology,
+                                                                   HWLOC_OBJ_NODE, 0, w,
+                                                                   OPAL_HWLOC_AVAILABLE))) {
+                    continue;
                 }
-                /* if we are located on more than one NUMA, or we didn't find
-                 * a NUMA we are on, then not much we can do
-                 */
-                if (1 == n_bound) {
-                    mca_btl_smcuda_component.mem_node = my_mem_node = numa;
-                } else {
-                    mca_btl_smcuda_component.mem_node = my_mem_node = -1;
+                /* get that NUMA node's available cpus */
+                avail = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj);
+                /* see if we intersect */
+                if (hwloc_bitmap_intersects(avail, opal_hwloc_my_cpuset)) {
+                    n_bound++;
+                    numa = w;
                 }
             }
+            /* if we are located on more than one NUMA, or we didn't find
+             * a NUMA we are on, then not much we can do
+             */
+            if (1 == n_bound) {
+                mca_btl_smcuda_component.mem_node = my_mem_node = numa;
+            } else {
+                mca_btl_smcuda_component.mem_node = my_mem_node = -1;
+            }
         }
     }

@@ -431,7 +469,7 @@ smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl,
                                 mca_btl_smcuda_component.sm_free_list_inc,
                                 mca_btl_smcuda_component.sm_mpool, 0, NULL, NULL, NULL);
     if ( OPAL_SUCCESS != i )
-       return i;
+        return i;

     mca_btl_smcuda_component.num_outstanding_frags = 0;

@@ -1120,8 +1158,8 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl,
     mca_common_wait_stream_synchronize(&rget_reg);

     rc = mca_common_cuda_memcpy(local_address, remote_memory_address, size,
-                              "mca_btl_smcuda_get", (mca_btl_base_descriptor_t *)frag,
-                              &done);
+                                "mca_btl_smcuda_get", (mca_btl_base_descriptor_t *)frag,
+                                &done);
     if (OPAL_SUCCESS != rc) {
         /* Out of resources can be handled by upper layers. */
         if (OPAL_ERR_OUT_OF_RESOURCE != rc) {
diff --git a/opal/mca/btl/usnic/btl_usnic_hwloc.c b/opal/mca/btl/usnic/btl_usnic_hwloc.c
index 78ef4c3abcb..e0230f02c9c 100644
--- a/opal/mca/btl/usnic/btl_usnic_hwloc.c
+++ b/opal/mca/btl/usnic/btl_usnic_hwloc.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2013-2016 Cisco Systems, Inc.  All rights reserved.
+ * Copyright (c) 2016      Intel, Inc. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -9,7 +10,7 @@

 #include "opal_config.h"

-#include "opal/mca/hwloc/hwloc.h"
+#include "opal/mca/hwloc/base/base.h"
 #include "opal/constants.h"

 #if BTL_IN_OPAL
@@ -191,6 +192,13 @@ int opal_btl_usnic_hwloc_distance(opal_btl_usnic_module_t *module)
     opal_output_verbose(5, USNIC_OUT,
                         "btl:usnic:filter_numa: filtering devices by NUMA distance");

+    /* ensure we have the topology */
+    if (OPAL_SUCCESS != opal_hwloc_base_get_topology()) {
+        opal_output_verbose(5, USNIC_OUT,
+                            "btl:usnic:filter_numa: not sorting devices by NUMA distance (topology not available)");
+        return OPAL_SUCCESS;
+    }
+
     /* Get the hwloc distance matrix for all NUMA nodes */
     if (OPAL_SUCCESS != (ret = get_distance_matrix())) {
         return ret;
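The smcuda hunk above treats a process as bound to a single NUMA node only when its location string contains neither ',' nor '-'. Assuming that list/range syntax for locations (e.g. "3" single, "0,1" list, "0-1" range), the test reduces to this sketch (parse_numa is illustrative, not an OMPI function):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Return the NUMA node index if the string names exactly one node,
     * or -1 if it is unknown or spans several nodes. */
    static int parse_numa(const char *numa_str)
    {
        if (NULL == numa_str ||
            NULL != strchr(numa_str, ',') ||
            NULL != strchr(numa_str, '-')) {
            return -1;
        }
        return (int) strtoul(numa_str, NULL, 10);
    }

    int main(void)
    {
        printf("%d %d %d\n", parse_numa("3"), parse_numa("0,1"), parse_numa("0-1"));
        return 0;                          /* prints: 3 -1 -1 */
    }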
diff --git a/opal/mca/hwloc/base/hwloc_base_util.c b/opal/mca/hwloc/base/hwloc_base_util.c
index 812435ee6d1..16bc6111d37 100644
--- a/opal/mca/hwloc/base/hwloc_base_util.c
+++ b/opal/mca/hwloc/base/hwloc_base_util.c
@@ -40,6 +40,7 @@
 #include "opal/util/os_dirpath.h"
 #include "opal/util/show_help.h"
 #include "opal/threads/tsd.h"
+#include "opal/mca/pmix/pmix.h"
 #include "opal/mca/hwloc/hwloc.h"
 #include "opal/mca/hwloc/base/base.h"

@@ -240,12 +241,65 @@ static void fill_cache_line_size(void)

 int opal_hwloc_base_get_topology(void)
 {
-    int rc=OPAL_SUCCESS;
+    int rc;
+    opal_process_name_t wildcard_rank;
+    char *val = NULL;

-    OPAL_OUTPUT_VERBOSE((5, opal_hwloc_base_framework.framework_output,
+    OPAL_OUTPUT_VERBOSE((2, opal_hwloc_base_framework.framework_output,
                          "hwloc:base:get_topology"));

-    if (NULL == opal_hwloc_base_topo_file) {
+    /* see if we already got it */
+    if (NULL != opal_hwloc_topology) {
+        return OPAL_SUCCESS;
+    }
+
+    if (NULL != opal_pmix.get) {
+        /* try to retrieve it from the PMIx store */
+        opal_output_verbose(1, opal_hwloc_base_framework.framework_output,
+                            "hwloc:base instantiating topology");
+        wildcard_rank.jobid = OPAL_PROC_MY_NAME.jobid;
+        wildcard_rank.vpid = OPAL_VPID_WILDCARD;
+        OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, OPAL_PMIX_LOCAL_TOPO,
+                                       &wildcard_rank, &val, OPAL_STRING);
+    } else {
+        rc = OPAL_ERR_NOT_SUPPORTED;
+    }
+
+    if (OPAL_SUCCESS == rc && NULL != val) {
+        /* load the topology */
+        if (0 != hwloc_topology_init(&opal_hwloc_topology)) {
+            free(val);
+            return OPAL_ERROR;
+        }
+        if (0 != hwloc_topology_set_xmlbuffer(opal_hwloc_topology, val, strlen(val))) {
+            free(val);
+            hwloc_topology_destroy(opal_hwloc_topology);
+            return OPAL_ERROR;
+        }
+        /* since we are loading this from an external source, we have to
+         * explicitly set a flag so hwloc sets things up correctly
+         */
+        if (0 != hwloc_topology_set_flags(opal_hwloc_topology,
+                                          (HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM |
+                                           HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM |
+                                           HWLOC_TOPOLOGY_FLAG_IO_DEVICES))) {
+            hwloc_topology_destroy(opal_hwloc_topology);
+            free(val);
+            return OPAL_ERROR;
+        }
+        /* now load the topology */
+        if (0 != hwloc_topology_load(opal_hwloc_topology)) {
+            hwloc_topology_destroy(opal_hwloc_topology);
+            free(val);
+            return OPAL_ERROR;
+        }
+        free(val);
+        /* filter the cpus thru any default cpu set */
+        if (OPAL_SUCCESS != (rc = opal_hwloc_base_filter_cpus(opal_hwloc_topology))) {
+            hwloc_topology_destroy(opal_hwloc_topology);
+            return rc;
+        }
+    } else if (NULL == opal_hwloc_base_topo_file) {
         if (0 != hwloc_topology_init(&opal_hwloc_topology) ||
             0 != hwloc_topology_set_flags(opal_hwloc_topology,
                                           (HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM |
@@ -266,7 +320,12 @@ int opal_hwloc_base_get_topology(void)
        line size */
     fill_cache_line_size();

-    return rc;
+    /* get or update our local cpuset - it will get used multiple
+     * times, so it's more efficient to keep a global copy
+     */
+    opal_hwloc_base_get_local_cpuset();
+
+    return OPAL_SUCCESS;
 }

 int opal_hwloc_base_set_topology(char *topofile)
diff --git a/opal/mca/pmix/pmix_types.h b/opal/mca/pmix/pmix_types.h
index 6e2c9ab2cc7..4e6f9ef0b9d 100644
--- a/opal/mca/pmix/pmix_types.h
+++ b/opal/mca/pmix/pmix_types.h
@@ -106,6 +106,7 @@ BEGIN_C_DECLS
 #define OPAL_PMIX_LOCALITY                  "pmix.loc"          // (uint16_t) relative locality of two procs
 #define OPAL_PMIX_TOPOLOGY_SIGNATURE        "pmix.toposig"      // (char*) topology signature string
 #define OPAL_PMIX_LOCALITY_STRING           "pmix.locstr"       // (char*) string describing a proc's location
+#define OPAL_PMIX_AVAIL_PHYS_MEMORY         "pmix.pmem"         // (uint64_t) total available physical memory on this node
 #define OPAL_PMIX_NODE_LIST                 "pmix.nlist"        // (char*) comma-delimited list of nodes running procs for the specified nspace
 #define OPAL_PMIX_ALLOCATED_NODELIST        "pmix.alist"        // (char*) comma-delimited list of all nodes in this allocation regardless of
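The XML path added to opal_hwloc_base_get_topology() rebuilds the topology from a buffer the daemon exported; the HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM flag is what tells hwloc that the imported description really is the local machine, so binding queries stay meaningful. A standalone round-trip against the hwloc 1.x API (error checks omitted for brevity; compile with -lhwloc):

    #include <hwloc.h>
    #include <stdio.h>

    int main(void)
    {
        hwloc_topology_t local, imported;
        char *xml;
        int len;

        /* "daemon" side: discover the machine and export it as XML */
        hwloc_topology_init(&local);
        hwloc_topology_load(local);
        hwloc_topology_export_xmlbuffer(local, &xml, &len);

        /* "client" side: rebuild from the buffer instead of rediscovering */
        hwloc_topology_init(&imported);
        hwloc_topology_set_xmlbuffer(imported, xml, len);
        hwloc_topology_set_flags(imported,
                                 HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM |
                                 HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM |
                                 HWLOC_TOPOLOGY_FLAG_IO_DEVICES);
        hwloc_topology_load(imported);

        printf("imported topology depth: %u\n", hwloc_topology_get_depth(imported));

        hwloc_free_xmlbuffer(local, xml);
        hwloc_topology_destroy(imported);
        hwloc_topology_destroy(local);
        return 0;
    }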
diff --git a/orte/mca/ess/base/ess_base_fns.c b/orte/mca/ess/base/ess_base_fns.c
index 1458ca56f84..e4504aeb2d9 100644
--- a/orte/mca/ess/base/ess_base_fns.c
+++ b/orte/mca/ess/base/ess_base_fns.c
@@ -12,7 +12,7 @@
  * Copyright (c) 2011-2012 Cisco Systems, Inc.  All rights reserved.
  * Copyright (c) 2011-2012 Los Alamos National Security, LLC.
  *                         All rights reserved.
- * Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
+ * Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
  * Copyright (c) 2014      Research Organization for Information Science
  *                         and Technology (RIST). All rights reserved.
  * $COPYRIGHT$
@@ -56,7 +56,8 @@ int orte_ess_base_proc_binding(void)
     char *error=NULL;
     hwloc_cpuset_t mycpus;

-    /* Determine if we were pre-bound or not */
+    /* Determine if we were pre-bound or not - this also indicates
+     * that we were launched via mpirun, bound or not */
     if (NULL != getenv(OPAL_MCA_PREFIX"orte_bound_at_launch")) {
         orte_proc_is_bound = true;
         if (NULL != (map = getenv(OPAL_MCA_PREFIX"orte_base_applied_binding"))) {
@@ -66,21 +67,49 @@ int orte_ess_base_proc_binding(void)
                 goto error;
             }
         }
+        if (opal_hwloc_report_bindings || 4 < opal_output_get_verbosity(orte_ess_base_framework.framework_output)) {
+            /* print out a shorthand notation to avoid pulling in the entire topology tree */
+            map = NULL;
+            OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCALITY_STRING,
+                                           ORTE_PROC_MY_NAME, &map, OPAL_STRING);
+            if (OPAL_SUCCESS == ret && NULL != map) {
+                opal_output(0, "MCW rank %s bound to %s",
+                            ORTE_VPID_PRINT(ORTE_PROC_MY_NAME->vpid), map);
+                free(map);
+            } else {
+                opal_output(0, "MCW rank %s not bound", ORTE_VPID_PRINT(ORTE_PROC_MY_NAME->vpid));
+            }
+        }
+        return ORTE_SUCCESS;
     } else if (NULL != getenv(OPAL_MCA_PREFIX"orte_externally_bound")) {
         orte_proc_is_bound = true;
+        /* see if we were launched by a PMIx-enabled system */
+        map = NULL;
+        OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCALITY_STRING,
+                                       ORTE_PROC_MY_NAME, &map, OPAL_STRING);
+        if (OPAL_SUCCESS == ret && NULL != map) {
+            /* we were - no need to pull in the topology */
+            if (opal_hwloc_report_bindings || 4 < opal_output_get_verbosity(orte_ess_base_framework.framework_output)) {
+                opal_output(0, "MCW rank %s bound to %s",
+                            ORTE_VPID_PRINT(ORTE_PROC_MY_NAME->vpid), map);
+            }
+            free(map);
+            return ORTE_SUCCESS;
+        }
         /* the topology system will pickup the binding pattern */
     }

+    /* load the topology as we will likely need it */
+    if (OPAL_SUCCESS != opal_hwloc_base_get_topology()) {
+        /* there is nothing we can do, so just return */
+        return ORTE_SUCCESS;
+    }
+
     /* see if we were bound when launched */
     if (!orte_proc_is_bound) {
         OPAL_OUTPUT_VERBOSE((5, orte_ess_base_framework.framework_output,
                              "%s Not bound at launch",
                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
-        /* we were not bound at launch */
-        if (NULL == opal_hwloc_topology) {
-            /* there is nothing we can do, so just return */
-            return ORTE_SUCCESS;
-        }
         support = (struct hwloc_topology_support*)hwloc_topology_get_support(opal_hwloc_topology);
         /* get our node object */
         node = hwloc_get_root_obj(opal_hwloc_topology);
@@ -257,11 +286,6 @@ int orte_ess_base_proc_binding(void)
     }

 MOVEON:
-    /* get or update our local cpuset - it will get used multiple
-     * times, so it's more efficient to keep a global copy
-     */
-    opal_hwloc_base_get_local_cpuset();
-
     /* get the cpus we are bound to */
     mycpus = hwloc_bitmap_alloc();
     if (hwloc_get_cpubind(opal_hwloc_topology,
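The early returns added above let a process launched by a PMIx-enabled resource manager report its binding purely from the published locality string, skipping the topology load entirely. A sketch of that gate with hypothetical helpers (lookup_locality and load_topology_and_bind are illustrative stand-ins):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    static char *lookup_locality(void)        /* stands in for the modex fetch */
    {
        const char *s = getenv("FAKE_LOCALITY");   /* hypothetical source */
        return s ? strdup(s) : NULL;
    }

    static int load_topology_and_bind(void)   /* the expensive fallback */
    {
        printf("loading topology...\n");
        return 0;
    }

    int main(void)
    {
        char *map = lookup_locality();
        if (NULL != map) {
            printf("MCW rank bound to %s\n", map);
            free(map);
            return 0;                          /* no topology needed */
        }
        return load_topology_and_bind();
    }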
diff --git a/orte/mca/ess/pmi/ess_pmi_module.c b/orte/mca/ess/pmi/ess_pmi_module.c
index d8780d99bc9..170804775cc 100644
--- a/orte/mca/ess/pmi/ess_pmi_module.c
+++ b/orte/mca/ess/pmi/ess_pmi_module.c
@@ -302,75 +302,6 @@ static int rte_init(void)
         }
     }

-    /* retrieve our topology */
-    val = NULL;
-    OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCAL_TOPO,
-                                   &wildcard_rank, &val, OPAL_STRING);
-    if (OPAL_SUCCESS == ret && NULL != val) {
-        /* load the topology */
-        if (0 != hwloc_topology_init(&opal_hwloc_topology)) {
-            ret = OPAL_ERROR;
-            free(val);
-            error = "setting topology";
-            goto error;
-        }
-        if (0 != hwloc_topology_set_xmlbuffer(opal_hwloc_topology, val, strlen(val))) {
-            ret = OPAL_ERROR;
-            free(val);
-            hwloc_topology_destroy(opal_hwloc_topology);
-            error = "setting topology";
-            goto error;
-        }
-        /* since we are loading this from an external source, we have to
-         * explicitly set a flag so hwloc sets things up correctly
-         */
-        if (0 != hwloc_topology_set_flags(opal_hwloc_topology,
-                                          (HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM |
-                                           HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM |
-                                           HWLOC_TOPOLOGY_FLAG_IO_DEVICES))) {
-            ret = OPAL_ERROR;
-            hwloc_topology_destroy(opal_hwloc_topology);
-            free(val);
-            error = "setting topology";
-            goto error;
-        }
-        /* now load the topology */
-        if (0 != hwloc_topology_load(opal_hwloc_topology)) {
-            ret = OPAL_ERROR;
-            hwloc_topology_destroy(opal_hwloc_topology);
-            free(val);
-            error = "setting topology";
-            goto error;
-        }
-        free(val);
-        /* filter the cpus thru any default cpu set */
-        if (OPAL_SUCCESS != (ret = opal_hwloc_base_filter_cpus(opal_hwloc_topology))) {
-            error = "filtering topology";
-            goto error;
-        }
-    } else {
-        /* it wasn't passed down to us, so go get it */
-        if (OPAL_SUCCESS != (ret = opal_hwloc_base_get_topology())) {
-            error = "topology discovery";
-            goto error;
-        }
-        /* push it into the PMIx database in case someone
-         * tries to retrieve it so we avoid an attempt to
-         * get it again */
-        kv = OBJ_NEW(opal_value_t);
-        kv->key = strdup(OPAL_PMIX_LOCAL_TOPO);
-        kv->type = OPAL_STRING;
-        if (0 != (ret = hwloc_topology_export_xmlbuffer(opal_hwloc_topology, &kv->data.string, &u32))) {
-            error = "topology export";
-            goto error;
-        }
-        if (OPAL_SUCCESS != (ret = opal_pmix.store_local(&wildcard_rank, kv))) {
-            error = "topology store";
-            goto error;
-        }
-        OBJ_RELEASE(kv);
-    }
-
     /* get our local peers */
     if (0 < orte_process_info.num_local_peers) {
         /* if my local rank if too high, then that's an error */
diff --git a/orte/mca/rmaps/base/rmaps_base_map_job.c b/orte/mca/rmaps/base/rmaps_base_map_job.c
index 074eb7335ec..6ee416514e1 100644
--- a/orte/mca/rmaps/base/rmaps_base_map_job.c
+++ b/orte/mca/rmaps/base/rmaps_base_map_job.c
@@ -190,6 +190,8 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
             /* if the user specified a default binding policy via
              * MCA param, then we use it - this can include a directive
              * to overload */
+            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
+                                "mca:rmaps[%d] binding policy given", __LINE__);
             jdata->map->binding = opal_hwloc_binding_policy;
         } else if (1 < jdata->map->cpus_per_rank) {
             /* bind to cpus */
@@ -238,6 +240,26 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
                     opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                         "mca:rmaps[%d] binding not given - using bynuma", __LINE__);
                     OPAL_SET_DEFAULT_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_NUMA);
+                } else {
+                    /* we are mapping by node or some other non-object method */
+                    if (nprocs <= 2) {
+                        if (opal_hwloc_use_hwthreads_as_cpus) {
+                            /* if we are using hwthread cpus, then bind to those */
+                            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
+                                                "mca:rmaps[%d] binding not given - using byhwthread", __LINE__);
+                            OPAL_SET_DEFAULT_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_HWTHREAD);
+                        } else {
+                            /* for performance, bind to core */
+                            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
+                                                "mca:rmaps[%d] binding not given - using bycore", __LINE__);
+                            OPAL_SET_DEFAULT_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_CORE);
+                        }
+                    } else {
+                        /* for performance, bind to NUMA */
+                        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
+                                            "mca:rmaps[%d] binding not given - using bynuma", __LINE__);
+                        OPAL_SET_DEFAULT_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_NUMA);
+                    }
                 }
             } else if (nprocs <= 2) {
                 if (opal_hwloc_use_hwthreads_as_cpus) {
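The new else branch completes the default-binding matrix so that map-by-node and other non-object mappings use the same small-job heuristic as the existing paths. Folded into one pure function for clarity (a sketch, not the OMPI code):

    #include <stdbool.h>
    #include <stdio.h>

    typedef enum { BIND_TO_HWTHREAD, BIND_TO_CORE, BIND_TO_NUMA } binding_t;

    /* Default-binding heuristic: small jobs bind to core (or hwthread when
     * hwthreads are treated as cpus); larger jobs bind to NUMA for performance. */
    static binding_t default_binding(int nprocs, bool hwthreads_as_cpus)
    {
        if (nprocs <= 2) {
            return hwthreads_as_cpus ? BIND_TO_HWTHREAD : BIND_TO_CORE;
        }
        return BIND_TO_NUMA;
    }

    int main(void)
    {
        printf("%d %d %d\n",
               default_binding(2, false),   /* core */
               default_binding(2, true),    /* hwthread */
               default_binding(16, false)); /* numa */
        return 0;
    }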
diff --git a/orte/orted/pmix/pmix_server_register_fns.c b/orte/orted/pmix/pmix_server_register_fns.c
index 84500a34228..4e8c67b3db0 100644
--- a/orte/orted/pmix/pmix_server_register_fns.c
+++ b/orte/orted/pmix/pmix_server_register_fns.c
@@ -67,6 +67,7 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
     uid_t uid;
     gid_t gid;
     opal_list_t *cache;
+    hwloc_obj_t machine;

     opal_output_verbose(2, orte_pmix_server_globals.output,
                         "%s register nspace for %s",
@@ -247,6 +248,16 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
     kv->data.string = strdup(orte_topo_signature);
     opal_list_append(info, &kv->super);

+    /* total available physical memory */
+    machine = hwloc_get_next_obj_by_type (opal_hwloc_topology, HWLOC_OBJ_MACHINE, NULL);
+    if (NULL != machine) {
+        kv = OBJ_NEW(opal_value_t);
+        kv->key = strdup(OPAL_PMIX_AVAIL_PHYS_MEMORY);
+        kv->type = OPAL_UINT64;
+        kv->data.uint64 = machine->memory.total_memory;
+        opal_list_append(info, &kv->super);
+    }
+
     /* register any local clients */
     vpid = ORTE_VPID_MAX;
     micro = NULL;
@@ -328,44 +339,53 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
                 kv->data.string = opal_hwloc_base_get_locality_string(opal_hwloc_topology, tmp);
                 opal_list_append(pmap, &kv->super);
                 free(tmp);
+            } else {
+                /* the proc is not bound */
+                kv = OBJ_NEW(opal_value_t);
+                kv->key = strdup(OPAL_PMIX_LOCALITY_STRING);
+                kv->type = OPAL_STRING;
+                kv->data.string = NULL;
+                opal_list_append(pmap, &kv->super);
             }
         }

-        /* appnum */
-        kv = OBJ_NEW(opal_value_t);
-        kv->key = strdup(OPAL_PMIX_APPNUM);
-        kv->type = OPAL_UINT32;
-        kv->data.uint32 = pptr->app_idx;
-        opal_list_append(pmap, &kv->super);
-
-        /* app ldr */
-        app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pptr->app_idx);
-        kv = OBJ_NEW(opal_value_t);
-        kv->key = strdup(OPAL_PMIX_APPLDR);
-        kv->type = OPAL_VPID;
-        kv->data.name.vpid = app->first_rank;
-        opal_list_append(pmap, &kv->super);
-
-        /* global/univ rank */
-        kv = OBJ_NEW(opal_value_t);
-        kv->key = strdup(OPAL_PMIX_GLOBAL_RANK);
-        kv->type = OPAL_VPID;
-        kv->data.name.vpid = pptr->name.vpid + jdata->offset;
-        opal_list_append(pmap, &kv->super);
-
-        /* app rank */
-        kv = OBJ_NEW(opal_value_t);
-        kv->key = strdup(OPAL_PMIX_APP_RANK);
-        kv->type = OPAL_VPID;
-        kv->data.name.vpid = pptr->app_rank;
-        opal_list_append(pmap, &kv->super);
-
-        /* app size */
-        kv = OBJ_NEW(opal_value_t);
-        kv->key = strdup(OPAL_PMIX_APP_SIZE);
-        kv->type = OPAL_UINT32;
-        kv->data.uint32 = app->num_procs;
-        opal_list_append(info, &kv->super);
+        if (1 < jdata->num_apps) {
+            /* appnum */
+            kv = OBJ_NEW(opal_value_t);
+            kv->key = strdup(OPAL_PMIX_APPNUM);
+            kv->type = OPAL_UINT32;
+            kv->data.uint32 = pptr->app_idx;
+            opal_list_append(pmap, &kv->super);
+
+            /* app ldr */
+            app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pptr->app_idx);
+            kv = OBJ_NEW(opal_value_t);
+            kv->key = strdup(OPAL_PMIX_APPLDR);
+            kv->type = OPAL_VPID;
+            kv->data.name.vpid = app->first_rank;
+            opal_list_append(pmap, &kv->super);
+
+            /* global/univ rank */
+            kv = OBJ_NEW(opal_value_t);
+            kv->key = strdup(OPAL_PMIX_GLOBAL_RANK);
+            kv->type = OPAL_VPID;
+            kv->data.name.vpid = pptr->name.vpid + jdata->offset;
+            opal_list_append(pmap, &kv->super);
+
+            /* app rank */
+            kv = OBJ_NEW(opal_value_t);
+            kv->key = strdup(OPAL_PMIX_APP_RANK);
+            kv->type = OPAL_VPID;
+            kv->data.name.vpid = pptr->app_rank;
+            opal_list_append(pmap, &kv->super);
+
+            /* app size */
+            kv = OBJ_NEW(opal_value_t);
+            kv->key = strdup(OPAL_PMIX_APP_SIZE);
+            kv->type = OPAL_UINT32;
+            kv->data.uint32 = app->num_procs;
+            opal_list_append(info, &kv->super);
+        }

         /* local rank */
         kv = OBJ_NEW(opal_value_t);
@@ -381,13 +401,6 @@ int orte_pmix_server_register_nspace(orte_job_t *jdata)
         kv->data.uint32 = pptr->node_rank;
         opal_list_append(pmap, &kv->super);

-        /* hostname */
-        kv = OBJ_NEW(opal_value_t);
-        kv->key = strdup(OPAL_PMIX_HOSTNAME);
-        kv->type = OPAL_STRING;
-        kv->data.string = strdup(pptr->node->name);
-        opal_list_append(pmap, &kv->super);
-
         /* node ID */
         kv = OBJ_NEW(opal_value_t);
         kv->key = strdup(OPAL_PMIX_NODEID);
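The OPAL_PMIX_AVAIL_PHYS_MEMORY value the daemon publishes above comes from the hwloc machine object, the same source the openib BTL falls back to. A standalone version of that hwloc 1.x query (machine->memory.total_memory is reported in bytes; compile with -lhwloc):

    #include <hwloc.h>
    #include <inttypes.h>
    #include <stdio.h>

    int main(void)
    {
        hwloc_topology_t topo;
        hwloc_obj_t machine;
        uint64_t mem = 0;

        hwloc_topology_init(&topo);
        hwloc_topology_load(topo);

        /* the machine object carries the node-wide memory totals */
        machine = hwloc_get_next_obj_by_type(topo, HWLOC_OBJ_MACHINE, NULL);
        if (NULL != machine) {
            mem = machine->memory.total_memory;
        }
        printf("total physical memory: %" PRIu64 " bytes\n", mem);

        hwloc_topology_destroy(topo);
        return 0;
    }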