diff --git a/ompi/mca/coll/han/coll_han.h b/ompi/mca/coll/han/coll_han.h index de4018bec22..4e5323fc046 100644 --- a/ompi/mca/coll/han/coll_han.h +++ b/ompi/mca/coll/han/coll_han.h @@ -6,6 +6,8 @@ * Copyright (c) 2020-2022 Bull S.A.S. All rights reserved. * Copyright (c) Amazon.com, Inc. or its affiliates. * All rights reserved. + * Copyright (c) 2023 Computer Architecture and VLSI Systems (CARV) + * Laboratory, ICS Forth. All rights reserved. * Copyright (c) 2024 NVIDIA Corporation. All rights reserved. * $COPYRIGHT$ * @@ -47,11 +49,11 @@ /* * Today; - * . only 2 modules available for intranode (low) level + * . 3 modules available for intranode (low) level * . only 2 modules available for internode (up) level */ -#define COLL_HAN_LOW_MODULES 2 +#define COLL_HAN_LOW_MODULES 3 #define COLL_HAN_UP_MODULES 2 struct mca_coll_han_bcast_args_s { diff --git a/ompi/mca/coll/han/coll_han_component.c b/ompi/mca/coll/han/coll_han_component.c index ed9582d5ffe..6ce8f5a06a8 100644 --- a/ompi/mca/coll/han/coll_han_component.c +++ b/ompi/mca/coll/han/coll_han_component.c @@ -4,6 +4,8 @@ * reserved. * Copyright (c) 2022 IBM Corporation. All rights reserved * Copyright (c) 2020-2022 Bull S.A.S. All rights reserved. + * Copyright (c) 2023 Computer Architecture and VLSI Systems (CARV) + * Laboratory, ICS Forth. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -43,7 +45,8 @@ ompi_coll_han_components ompi_coll_han_available_components[COMPONENTS_COUNT] = { TUNED, "tuned" }, { SM, "sm" }, /* this should not be used, the collective component is gone */ { ADAPT, "adapt" }, - { HAN, "han" } + { HAN, "han" }, + { XHC, "xhc" } }; /* @@ -287,7 +290,7 @@ static int han_register(void) cs->han_bcast_low_module = 0; (void) mca_coll_han_query_module_from_mca(c, "bcast_low_module", - "low level module for bcast, currently only 0 for tuned", + "low level module for bcast, 0 tuned, 2 xhc", OPAL_INFO_LVL_9, &cs->han_bcast_low_module, &cs->han_op_module_name.bcast.han_op_low_module_name); @@ -307,7 +310,7 @@ static int han_register(void) cs->han_reduce_low_module = 0; (void) mca_coll_han_query_module_from_mca(c, "reduce_low_module", - "low level module for allreduce, currently only 0 tuned", + "low level module for allreduce, 0 tuned, 2 xhc", OPAL_INFO_LVL_9, &cs->han_reduce_low_module, &cs->han_op_module_name.reduce.han_op_low_module_name); @@ -326,7 +329,7 @@ static int han_register(void) cs->han_allreduce_low_module = 0; (void) mca_coll_han_query_module_from_mca(c, "allreduce_low_module", - "low level module for allreduce, currently only 0 tuned", + "low level module for allreduce, 0 tuned, 2 xhc", OPAL_INFO_LVL_9, &cs->han_allreduce_low_module, &cs->han_op_module_name.allreduce.han_op_low_module_name); @@ -338,7 +341,7 @@ static int han_register(void) cs->han_allgather_low_module = 0; (void) mca_coll_han_query_module_from_mca(c, "allgather_low_module", - "low level module for allgather, currently only 0 tuned", + "low level module for allgather, 0 tuned, 2 xhc", OPAL_INFO_LVL_9, &cs->han_allgather_low_module, &cs->han_op_module_name.allgather.han_op_low_module_name); @@ -350,7 +353,7 @@ static int han_register(void) cs->han_gather_low_module = 0; (void) mca_coll_han_query_module_from_mca(c, "gather_low_module", - "low level module for gather, currently only 0 tuned", + "low level module for gather, 0 tuned, 2 xhc", OPAL_INFO_LVL_9, &cs->han_gather_low_module, &cs->han_op_module_name.gather.han_op_low_module_name); @@ -374,7 +377,7 @@ static int han_register(void) cs->han_scatter_low_module = 0; 
(void) mca_coll_han_query_module_from_mca(c, "scatter_low_module", - "low level module for scatter, currently only 0 tuned", + "low level module for scatter, 0 tuned, 2 xhc", OPAL_INFO_LVL_9, &cs->han_scatter_low_module, &cs->han_op_module_name.scatter.han_op_low_module_name); diff --git a/ompi/mca/coll/han/coll_han_dynamic.h b/ompi/mca/coll/han/coll_han_dynamic.h index 403e458391e..82114227308 100644 --- a/ompi/mca/coll/han/coll_han_dynamic.h +++ b/ompi/mca/coll/han/coll_han_dynamic.h @@ -5,6 +5,8 @@ * reserved. * Copyright (c) 2020 Bull S.A.S. All rights reserved. * Copyright (c) 2022 IBM Corporation. All rights reserved + * Copyright (c) 2023 Computer Architecture and VLSI Systems (CARV) + * Laboratory, ICS Forth. All rights reserved. * * $COPYRIGHT$ * @@ -105,6 +107,7 @@ typedef enum COMPONENTS { SM, ADAPT, HAN, + XHC, COMPONENTS_COUNT } COMPONENT_T; diff --git a/ompi/mca/coll/han/coll_han_subcomms.c b/ompi/mca/coll/han/coll_han_subcomms.c index d1330188f41..92bddb3ba51 100644 --- a/ompi/mca/coll/han/coll_han_subcomms.c +++ b/ompi/mca/coll/han/coll_han_subcomms.c @@ -3,6 +3,8 @@ * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2020 Bull S.A.S. All rights reserved. + * Copyright (c) 2023 Computer Architecture and VLSI Systems (CARV) + * Laboratory, ICS Forth. All rights reserved. * * Copyright (c) 2024 NVIDIA Corporation. All rights reserved. * $COPYRIGHT$ @@ -314,6 +316,10 @@ int mca_coll_han_comm_create(struct ompi_communicator_t *comm, &comm_info, &(low_comms[1])); assert(OMPI_COMM_IS_DISJOINT_SET(low_comms[1]) && !OMPI_COMM_IS_DISJOINT(low_comms[1])); + opal_info_set(&comm_info, "ompi_comm_coll_preference", "xhc,^han"); + ompi_comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, + &comm_info, &(low_comms[2])); + /* * Upgrade libnbc module priority to set up up_comms[0] with libnbc module * This sub-communicator contains one process per node: processes with the diff --git a/ompi/mca/coll/xhc/Makefile.am b/ompi/mca/coll/xhc/Makefile.am new file mode 100644 index 00000000000..35db0b89c12 --- /dev/null +++ b/ompi/mca/coll/xhc/Makefile.am @@ -0,0 +1,44 @@ +# +# Copyright (c) 2021-2023 Computer Architecture and VLSI Systems (CARV) +# Laboratory, ICS Forth. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +dist_opaldata_DATA = help-coll-xhc.txt + +sources = \ + coll_xhc.h \ + coll_xhc_atomic.h \ + coll_xhc.c \ + coll_xhc_component.c \ + coll_xhc_module.c \ + coll_xhc_bcast.c \ + coll_xhc_barrier.c \ + coll_xhc_reduce.c \ + coll_xhc_allreduce.c + +# Make the output library in this directory, and name it either +# mca__.la (for DSO builds) or libmca__.la +# (for static builds). 
+ +component_noinst = +component_install = +if MCA_BUILD_ompi_coll_xhc_DSO +component_install += mca_coll_xhc.la +else +component_noinst += libmca_coll_xhc.la +endif + +mcacomponentdir = $(ompilibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_coll_xhc_la_SOURCES = $(sources) +mca_coll_xhc_la_LDFLAGS = -module -avoid-version +mca_coll_xhc_la_LIBADD = $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la + +noinst_LTLIBRARIES = $(component_noinst) +libmca_coll_xhc_la_SOURCES = $(sources) +libmca_coll_xhc_la_LDFLAGS = -module -avoid-version diff --git a/ompi/mca/coll/xhc/README.md b/ompi/mca/coll/xhc/README.md new file mode 100644 index 00000000000..325170b7179 --- /dev/null +++ b/ompi/mca/coll/xhc/README.md @@ -0,0 +1,282 @@ +# XHC: XPMEM-based Hierarchical Collectives + +The XHC component, implements hierarchical & topology-aware intra-node MPI +collectives, utilizing XPMEM in order to achieve efficient shared address space +memory access between processes. + +## Main features + +* Constructs an **n-level hierarchy** (i.e. no algorithmic limitation on level +count), following the system's hardware topology. Ranks/processes are grouped +together according to their relative locations; this information is known +thanks to Hwloc, and is obtained via OpenMPI's integrated book-keeping. + + Topological features that can currently be defined (configurable via MCA params): + + - NUMA node + - CPU Socket + - L1, L2, L3 cache + - Hwthread, core + - Node/flat (no hierarchy) + + Example of a 3-level XHC hierarchy (sensitivity to numa & socket locality): + + ![Example of 3-level XHC hierarchy](resources/xhc-hierarchy.svg) + + Furthermore, support for custom virtual user-defined hierarchies is + available, to aid when fine-grained control over the communication pattern + is necessary. + +* Support for both **zero-copy** and **copy-in-copy-out** data transportation. + - Switchover at static but configurable message size. + + - CICO buffers permanently attached at module initialization + + - Application buffers attached on the fly the first time they appear, saved + on and recovered from registration cache in subsequent appearances. + (assuming smsc/xpmem) + +* Integration with Open MPI's `opal/smsc` (shared-memory-single-copy) +framework. Selection of `smsc/xpmem` is highly recommended. + + - Bcast support: XPMEM, CMA, KNEM + - Allreduce support: XPMEM + - Barrier support: *(all, irrelevant)* + +* Data-wise **pipelining** across all levels of the hierarchy allows for +lowering hierarchy-induced start-up overheads. Pipelining also allows for +interleaving of operations in some collectives (reduce+bcast in allreduce). + +* **Lock-free** single-writer synchronization, with cache-line separation where +necessary/beneficial. Consistency ensured via lightweight memory barriers. + +## Configuration options -- MCA params + +XHC can be customized via a number of standard Open MPI MCA parameters, though +defaults that should satisfy a wide number of systems are in place. + +The available parameters: + +#### *(prepend with "coll_xhc_")* +*(list may be outdated, please also check `ompi_info` and `coll_xhc_component.c`)* + +* **priority** (default `0`): The priority of the coll/xhc component, used +during the component selection process. + +* **print_info** (default `false`): Print information about XHC's generated +hierarchy and its configuration. + +* **shmem_backing** (default `/dev/shm`): Backing directory for shmem files +used for XHC's synchronization fields and CICO buffers. 
+
+* **dynamic_leader** (default `false`): Enables the feature that dynamically
+elects an XHC group leader at each collective (currently only applicable
+to bcast).
+
+* **dynamic_reduce** (default `1`=`non-float`): Controls the
+feature that allows for out-of-order reduction. XHC ranks reduce chunks
+directly from multiple peers' buffers; dynamic reduction allows them to
+temporarily skip a peer when the expected data is not yet prepared, instead of
+stalling. Setting this to `2`=`all` may harm the reproducibility of float-based
+reductions.
+
+* **lb_reduce_leader_assist** (default `top,first`): Controls the
+leader-to-member load balancing mode in reductions. If set to none/empty (`""`),
+only non-leader group members perform reductions. With `top` in the list, the
+leader of the top-most level also performs reductions in its group. With
+`first` in the list, leaders will help with the reduction workload for just one
+chunk at the beginning of the operation. If `all` is specified, all group
+members, including the leaders, perform reductions indiscriminately.
+
+* **force_reduce** (default `false`): Force-enable the "special" Reduce
+implementation for all calls to MPI_Reduce. This implementation assumes that
+the `rbuf` parameter to MPI_Reduce is valid and appropriately sized for all
+ranks, not just the root -- you have to make sure that this is indeed the case
+with the application at hand. Only works with `root = 0`.
+
+* **hierarchy** (default `"numa,socket"`): A comma-separated list of
+topological features to which XHC's hierarchy-building algorithm should be
+sensitive. `ompi_info` reports the possible values for the parameter.
+
+  - In some ways, this is "just" a suggestion. The resulting hierarchy may
+  not exactly match the requested one. Reasons this may occur:
+
+    - A requested topological feature does not effectively segment the set
+    of ranks. (eg. `numa` was specified, but all ranks reside in the same
+    NUMA node)
+
+    - No feature that all ranks have in common was provided. This is a more
+    intrinsic detail that you probably don't need to be aware of, but you
+    might come across if eg. you investigate the output of `print_info`. An
+    additional level will automatically be added in this case, no need to
+    worry about it.
+
+      For all intents and purposes, a hierarchy of `numa,socket` is
+      interpreted as "segment the ranks according to NUMA node locality,
+      and then further segment them according to CPU socket locality".
+      Three groups will be created: the intra-NUMA one, the intra-socket
+      one, and an intra-node one.
+
+    - The provided features will automatically be re-ordered when their
+    order does not match their order in the physical system (unless a
+    virtual hierarchy feature is present in the list).
+
+  - *Virtual Hierarchies*: The string may alternatively also contain "rank
+  lists" which specify exactly which ranks to group together, as well as some
+  other special modifiers. See
+  `coll_xhc_component.c:xhc_component_parse_hierarchy()` for further
+  explanation as well as syntax information.
+
+* **chunk_size** (default `16K`): The chunk size for the pipelining process.
+Data is processed (eg. broadcast, reduced) in pieces of this size at a time.
+
+  - It's possible to have a different chunk size for each level of the
+  hierarchy, achieved by providing a comma-separated list of sizes (eg.
+  `"16K,16K,128K"`) instead of a single one.
The sizes in this list *DO NOT*
+  correspond to the items in the hierarchy list; the hierarchy keys might be
+  re-ordered or reduced to match the system, but the chunk sizes will be
+  consumed in the order they are given, left-to-right -> bottom-to-top.
+
+* **uniform_chunks** (default `true`): Automatically optimize the chunk size
+in reduction collectives, according to the message size, so that all members
+will perform equal work.
+
+* **uniform_chunks_min** (default `1K`): The lowest allowed value for the chunk
+size when uniform chunks are enabled. Each worker will reduce at least this much
+data, or the workload is not split up at all.
+
+* **cico_max** (default `1K`): Copy-in-copy-out, instead of single-copy, will be
+used for messages of *cico_max* bytes or less.
+
+*(Removed Parameters)*
+
+* **rcache_max**, **rcache_max_global** *(REMOVED with shift to opal/smsc)*:
+Limit on the number of attachments that the registration cache should hold.
+
+  - A case can be made about their usefulness. If desired, they should be
+  re-implemented at the smsc level.
+
+## Limitations
+
+- *Intra-node support only*
+  - Usage in multi-node scenarios is possible via Open MPI's HAN.
+
+- **Heterogeneity**: XHC does not support nodes with non-uniform (rank-wise)
+datatype representations. (determined according to the `proc_arch` field)
+
+- **Non-commutative** operators are not supported by XHC's reduction
+collectives. In past versions, they were supported, but only with the flat
+hierarchy configuration; this could make a return at some point.
+
+- XHC's Reduce is not fully complete. Instead, it is a "special" implementation
+of MPI_Reduce that is realized as a sub-case of XHC's Allreduce.
+
+  - If the caller guarantees that the `rbuf` parameter is valid for all ranks
+  (not just the root), like in Allreduce, this special Reduce can be invoked
+  by specifying `root=-1`, which will trigger a Reduce to rank `0` (the only
+  one currently supported).
+
+  - Current prime use-case: HAN's Allreduce
+
+  - Furthermore, if it is guaranteed that all Reduce calls in an application
+  satisfy the above criteria, see the `force_reduce` MCA parameter.
+
+  - XHC's Reduce is not yet fully optimized for small messages.
+
+## Building
+
+XHC is built as a standard mca/coll component.
+
+To reap its full benefits, XPMEM support in Open MPI is required. XHC will build
+and work without it, but the reduction operations will be disabled and
+broadcast will fall back to less efficient mechanisms (CMA, KNEM).
+
+## Running
+
+In order for the XHC component to be chosen, make sure that its priority is
+higher than that of other components that provide the collectives of interest;
+use the `coll_xhc_priority` MCA parameter. If a list of collective modules is
+included via the `coll` MCA parameter, make sure XHC is in the list.
+
+* You may also want to add the `--bind-to core` param. Otherwise, the reported
+process localities might be too general, preventing XHC from correctly
+segmenting the system. (`coll_xhc_print_info` will report the generated
+hierarchy)
+
+### Tuning
+
+* Optional: You might wish to manually specify the topological features that
+XHC's hierarchy should conform to. The default is `numa,socket`, which will
+group the processes according to NUMA locality and then further group them
+according to socket locality. See the `coll_xhc_hierarchy` param.
+ + - Example: `--mca coll_xhc_hierarchy numa,socket` + - Example: `--mca coll_xhc_hierarchy numa` + - Example: `--mca coll_xhc_hierarchy flat` + + In some systems, small-message Broadcast or the Barrier operation might + perform better with a flat tree instead of a hierarchical one. Currently, + manual benchmarking is required to accurately determine this. + +* Optional: You might wish to tune XHC's chunk size (default `16K`). Use the +`coll_xhc_chunk_size` param, and try values close to the default and see if +improvements are observed. You may even try specifying different chunk sizes +for each hierarchy level -- use the same process, starting from the same chunk +size for all levels and decreasing/increasing from there. + + - Example: `--mca coll_xhc_chunk_size 16K` + - Example: `--mca coll_xhc_chunk_size 16K,32K,128K` + +* Optional: If you wish to focus on latencies of small messages, you can try +altering the cico-to-zcopy switchover point (`coll_xhc_cico_max`, default +`1K`). + + - Example: `--mca coll_xhc_cico_max 1K` + +* Optional: If your application is heavy in Broadcast calls and you suspect +that specific ranks might be joining the collective with delay and causing +others to stall waiting for them, you could try enabling dynamic leadership +(`coll_xhc_dynamic_leader`), and seeing if it marks an improvement. + + - Example: `--mca coll_xhc_dynamic_leader 1` + +### Example command lines + +*Assuming `PATH` and `LD_LIBRARY_PATH` have been set appropriately.* + +Default XHC configuration: +`$ mpirun --mca coll libnbc,basic,xhc --mca coll_xhc_priority 100 --bind-to core ` + +XHC w/ numa-sensitive hierarchy, chunk size @ 16K: +`$ mpirun --mca coll libnbc,basic,xhc --mca coll_xhc_priority 100 --mca coll_xhc_hierarchy numa --mca coll_xhc_chunk_size 16K --bind-to core ` + +XHC with flat hierarchy (ie. none at all): +`$ mpirun --mca coll libnbc,basic,xhc --mca coll_xhc_priority 100 --mca coll_xhc_hierarchy node [--bind-to core] ` + +## Publications + +1. **A framework for hierarchical single-copy MPI collectives on multicore nodes**, +*George Katevenis, Manolis Ploumidis, Manolis Marazakis*, +IEEE Cluster 2022, Heidelberg, Germany. +https://ieeexplore.ieee.org/document/9912729 + +## Contact + +- George Katevenis (gkatev@ics.forth.gr) +- Manolis Ploumidis (ploumid@ics.forth.gr) + +Computer Architecture and VLSI Systems (CARV) Laboratory, ICS Forth + +## Acknowledgments + +We thankfully acknowledge the support of the European Commission and the Greek +General Secretariat for Research and Innovation under the EuroHPC Programme +through the **DEEP-SEA** project (GA 955606). National contributions from the +involved state members (including the Greek General Secretariat for Research +and Innovation) match the EuroHPC funding. + +This work is partly supported by project **EUPEX**, which has received funding +from the European High-Performance Computing Joint Undertaking (JU) under grant +agreement No 101033975. The JU receives support from the European Union's +Horizon 2020 re-search and innovation programme and France, Germany, Italy, +Greece, United Kingdom, Czech Republic, Croatia. diff --git a/ompi/mca/coll/xhc/coll_xhc.c b/ompi/mca/coll/xhc/coll_xhc.c new file mode 100644 index 00000000000..d7221ffb37a --- /dev/null +++ b/ompi/mca/coll/xhc/coll_xhc.c @@ -0,0 +1,748 @@ +/* + * Copyright (c) 2021-2023 Computer Architecture and VLSI Systems (CARV) + * Laboratory, ICS Forth. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "mpi.h" +#include "ompi/constants.h" +#include "ompi/communicator/communicator.h" +#include "ompi/mca/coll/coll.h" +#include "ompi/mca/coll/base/base.h" + +#include "opal/mca/rcache/rcache.h" +#include "opal/mca/shmem/base/base.h" +#include "opal/mca/smsc/smsc.h" + +#include "opal/include/opal/align.h" +#include "opal/util/show_help.h" +#include "opal/util/minmax.h" + +#include "coll_xhc.h" + +static int xhc_comms_make(ompi_communicator_t *ompi_comm, + xhc_peer_info_t *peer_info, xhc_comm_t **comms_dst, + int *comm_count_dst, xhc_loc_t *hierarchy, int hierarchy_len); +static void xhc_comms_destroy(xhc_comm_t *comms, int comm_count); + +static int xhc_print_info(xhc_module_t *module, + ompi_communicator_t *comm, xhc_data_t *data); + +static void *xhc_shmem_create(opal_shmem_ds_t *seg_ds, size_t size, + ompi_communicator_t *ompi_comm, const char *name_chr_s, int name_chr_i); +static void *xhc_shmem_attach(opal_shmem_ds_t *seg_ds); +static mca_smsc_endpoint_t *xhc_smsc_ep(xhc_peer_info_t *peer_info); + +// ------------------------------------------------ + +int mca_coll_xhc_lazy_init(xhc_module_t *module, ompi_communicator_t *comm) { + + int comm_size = ompi_comm_size(comm); + int rank = ompi_comm_rank(comm); + + xhc_peer_info_t *peer_info = module->peer_info; + + opal_shmem_ds_t *peer_cico_ds = NULL; + xhc_data_t *data = NULL; + + xhc_coll_fns_t xhc_fns; + + int return_code = OMPI_SUCCESS; + int ret; + + errno = 0; + + // ---- + + /* XHC requires rank communication during its initialization. + * Temporarily apply the saved fallback collective modules, + * and restore XHC's after initialization is done. */ + xhc_module_install_fallback_fns(module, comm, &xhc_fns); + + // ---- + + ret = xhc_module_prepare_hierarchy(module, comm); + if(ret != OMPI_SUCCESS) { + RETURN_WITH_ERROR(return_code, ret, end); + } + + // ---- + + data = malloc(sizeof(xhc_data_t)); + peer_cico_ds = malloc(comm_size * sizeof(opal_shmem_ds_t)); + if(!data || !peer_cico_ds) { + RETURN_WITH_ERROR(return_code, OMPI_ERR_OUT_OF_RESOURCE, end); + } + + *data = (xhc_data_t) { + .comms = NULL, + .comm_count = -1, + + .pvt_coll_seq = 0 + }; + + // ---- + + if(OMPI_XHC_CICO_MAX > 0) { + opal_shmem_ds_t cico_ds; + + void *my_cico = xhc_shmem_create(&cico_ds, + OMPI_XHC_CICO_MAX, comm, "cico", 0); + if(!my_cico) { + RETURN_WITH_ERROR(return_code, OMPI_ERR_OUT_OF_RESOURCE, end); + } + + /* Manually "touch" to assert allocation in local NUMA node + * (assuming linux's default firt-touch-alloc NUMA policy) */ + memset(my_cico, 0, OMPI_XHC_CICO_MAX); + + ret = comm->c_coll->coll_allgather(&cico_ds, + sizeof(opal_shmem_ds_t), MPI_BYTE, peer_cico_ds, + sizeof(opal_shmem_ds_t), MPI_BYTE, comm, + comm->c_coll->coll_allgather_module); + if(ret != OMPI_SUCCESS) { + RETURN_WITH_ERROR(return_code, ret, end); + } + + for(int r = 0; r < comm_size; r++) { + peer_info[r].cico_ds = peer_cico_ds[r]; + } + + peer_info[rank].cico_buffer = my_cico; + } + + // ---- + + /* An XHC communicator is created for each level of the hierarchy. + * The hierachy must be in an order of most-specific to most-general. 
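+ * (For example, with the default "numa,socket" configuration, the list is
+ * ordered NUMA first, then socket, then the automatically added node-wide
+ * locality -- see the README for details.)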
*/ + + ret = xhc_comms_make(comm, peer_info, &data->comms, &data->comm_count, + module->hierarchy, module->hierarchy_len); + if(ret != OMPI_SUCCESS) { + RETURN_WITH_ERROR(return_code, ret, end); + } + + for(int i = 0, c = 0; i < data->comm_count; i++) { + data->comms[i].chunk_size = module->chunks[c]; + c = opal_min(c + 1, module->chunks_len - 1); + } + + if(module->chunks_len < data->comm_count) { + opal_output_verbose(MCA_BASE_VERBOSE_WARN, + ompi_coll_base_framework.framework_output, + "coll:xhc: Warning: The chunk sizes count is shorter than the " + "hierarchy size; filling in with the last entry provided"); + } else if(module->chunks_len > data->comm_count) { + opal_output_verbose(MCA_BASE_VERBOSE_WARN, + ompi_coll_base_framework.framework_output, + "coll:xhc: Warning: The chunk size count is larger than the " + "hierarchy size; omitting last entries"); + } + + // ---- + + if(mca_coll_xhc_component.print_info) { + ret = xhc_print_info(module, comm, data); + if(ret != OMPI_SUCCESS) { + RETURN_WITH_ERROR(return_code, ret, end); + } + } + + // ---- + + module->data = data; + module->init = true; + + end: + + xhc_module_install_fns(module, comm, xhc_fns); + + free(peer_cico_ds); + + if(return_code != 0) { + opal_show_help("help-coll-xhc.txt", "xhc-init-failed", true, + return_code, errno, strerror(errno)); + + xhc_fini(module); + } + + return return_code; +} + +void mca_coll_xhc_fini(mca_coll_xhc_module_t *module) { + if(module->data) { + xhc_data_t *data = module->data; + + if(data->comm_count >= 0) { + xhc_comms_destroy(data->comms, data->comm_count); + } + + free(data->comms); + free(data); + } + + if(module->peer_info) { + for(int r = 0; r < module->comm_size; r++) { + if(module->peer_info[r].cico_buffer) { + if(r == module->rank) { + // OMPI issue #11123 + // opal_shmem_unlink(&module->peer_info[r].cico_ds); + } + + opal_shmem_segment_detach(&module->peer_info[r].cico_ds); + } + + if(module->peer_info[r].smsc_ep) { + MCA_SMSC_CALL(return_endpoint, module->peer_info[r].smsc_ep); + } + } + } +} + +// ------------------------------------------------ + +/* This method is where the hierarchy of XHC is constructed; it receives + * the hierarchy specifications (hierarchy param) and groups ranks together + * among them. The process begins with the first locality in the list. All + * ranks that share this locality (determined via the relative peer to peer + * distances) become siblings. The one amongst them with the lowest rank + * number becomes the manager/leader of the group. The members don't really + * need to keep track of the actual ranks of their siblings -- only the rank + * of the group's leader/manager, the size of the group, and their own member + * ID. The process continues with the next locality, only that now only the + * ranks that became leaders in the previous level are eligible (determined + * via comm_candidate, see inline comments). 
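+ *
+ * Hypothetical example, with a hierarchy of {NUMA, NODE} and ranks 0-3 on
+ * NUMA 0, ranks 4-7 on NUMA 1: the first level forms the groups {0,1,2,3}
+ * (leader 0) and {4,5,6,7} (leader 4); the second level then groups the two
+ * leaders, {0,4}, with 0 as its leader. Rank 5, for instance, only records
+ * that its level-0 group has size 4, that its leader is rank 4, and that its
+ * own member ID within that group is 1.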
*/ +static int xhc_comms_make(ompi_communicator_t *ompi_comm, + xhc_peer_info_t *peer_info, xhc_comm_t **comms_dst, + int *comm_count_dst, xhc_loc_t *hierarchy, int hierarchy_len) { + + int ompi_rank = ompi_comm_rank(ompi_comm); + int ompi_size = ompi_comm_size(ompi_comm); + + xhc_comm_t *comms = NULL; + int comms_size = 0; + int comm_count = 0; + + opal_shmem_ds_t *comm_ctrl_ds; + bool *comm_candidate; + + size_t smsc_reg_size = 0; + + int return_code = OMPI_SUCCESS; + int ret; + + comms = malloc((comms_size = 5) * sizeof(xhc_comm_t)); + comm_ctrl_ds = malloc(ompi_size * sizeof(opal_shmem_ds_t)); + comm_candidate = malloc(ompi_size * sizeof(bool)); + + if(!comms || !comm_ctrl_ds || !comm_candidate) { + RETURN_WITH_ERROR(return_code, OMPI_ERR_OUT_OF_RESOURCE, end); + } + + if(mca_smsc_base_has_feature(MCA_SMSC_FEATURE_REQUIRE_REGISTRATION)) { + smsc_reg_size = mca_smsc_base_registration_data_size(); + } + + for(int h = 0; h < hierarchy_len; h++) { + xhc_comm_t *xc = &comms[comm_count]; + + if(comm_count == comms_size) { + void *tmp = realloc(comms, (comms_size *= 2) * sizeof(xhc_comm_t)); + if(!tmp) { + RETURN_WITH_ERROR(return_code, OMPI_ERR_OUT_OF_RESOURCE, end); + } + comms = tmp; + } + + *xc = (xhc_comm_t) { + .locality = hierarchy[h], + + .size = 0, + .manager_rank = -1, + + .member_info = NULL, + .reduce_queue = NULL, + + .comm_ctrl = NULL, + .member_ctrl = NULL, + + .ctrl_ds = (opal_shmem_ds_t) {0} + }; + + // ---- + + /* Only ranks that were leaders in the previous level are candidates + * for this one. Every rank advertises whether others may consider + * it for inclusion via an Allgather. */ + + bool is_candidate = (comm_count == 0 + || comms[comm_count - 1].manager_rank == ompi_rank); + + ret = ompi_comm->c_coll->coll_allgather(&is_candidate, 1, + MPI_C_BOOL, comm_candidate, 1, MPI_C_BOOL, + ompi_comm, ompi_comm->c_coll->coll_allgather_module); + if(ret != OMPI_SUCCESS) { + RETURN_WITH_ERROR(return_code, ret, comm_error); + } + + for(int r = 0; r < ompi_size; r++) { + + /* If on a non-bottom comm, only managers of the previous + * comm are "full" members. However, this procedure also has + * to take place for the bottom-most comm; even if this is the + * current rank's bottom-most comm, it may not actually be so, + * for another rank (eg. with some non-symmetric hierarchies). */ + if(comm_candidate[r] == false) { + continue; + } + + // Non-local --> not part of the comm :/ + if(!PEER_IS_LOCAL(peer_info, r, xc->locality)) { + continue; + } + + /* The member ID means slightly different things whether on the + * bottom-most comm or not. On the bottom-most comm, a rank can + * either be a "full" member or not. However, on higher-up comms, + * if a rank was not a manager on the previous comm, it will not + * a "full" member. Instead, it will be a "potential" member, in + * that it keeps information about this comm, and is ready to + * take over duties and act as a normal member for a specific + * collective (eg. dynamic leader feature, or root != manager). */ + if(r == ompi_rank || (comm_count > 0 && r == comms[comm_count - 1].manager_rank)) { + xc->member_id = xc->size; + } + + // First rank to join the comm becomes the manager + if(xc->manager_rank == -1) { + xc->manager_rank = r; + } + + xc->size++; + } + + /* If there are no local peers in regards to this locality, no + * XHC comm is created for this process on this level. 
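+ * (Note that xc->size also counts this process itself, hence the <= 1 check.)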
*/ + if(xc->size <= 1) { + opal_output_verbose(MCA_BASE_VERBOSE_WARN, + ompi_coll_base_framework.framework_output, + "coll:xhc: Warning: Locality 0x%04x does not result " + "in any new groupings; skipping it", xc->locality); + + /* All ranks must participate in the "control struct sharing" + * allgather, even if useless to this rank to some of them */ + + ret = ompi_comm->c_coll->coll_allgather(&xc->ctrl_ds, + sizeof(opal_shmem_ds_t), MPI_BYTE, comm_ctrl_ds, + sizeof(opal_shmem_ds_t), MPI_BYTE, ompi_comm, + ompi_comm->c_coll->coll_allgather_module); + if(ret != OMPI_SUCCESS) { + RETURN_WITH_ERROR(return_code, ret, comm_error); + } + + xhc_comms_destroy(xc, 1); + continue; + } + + // ---- + + /* Init comm stuff */ + + xc->member_info = calloc(xc->size, sizeof(xhc_member_info_t)); + if(xc->member_info == NULL) { + RETURN_WITH_ERROR(return_code, OMPI_ERR_OUT_OF_RESOURCE, comm_error); + } + + xc->reduce_queue = OBJ_NEW(opal_list_t); + if(!xc->reduce_queue) { + RETURN_WITH_ERROR(return_code, OMPI_ERR_OUT_OF_RESOURCE, comm_error); + } + + for(int m = 0; m < xc->size - 1; m++) { + xhc_rq_item_t *item = OBJ_NEW(xhc_rq_item_t); + if(!item) { + RETURN_WITH_ERROR(return_code, + OMPI_ERR_OUT_OF_RESOURCE, comm_error); + } + + opal_list_append(xc->reduce_queue, (opal_list_item_t *) item); + } + + // ---- + + // Create shared structs + if(ompi_rank == xc->manager_rank) { + size_t ctrl_len = sizeof(xhc_comm_ctrl_t) + smsc_reg_size + + xc->size * sizeof(xhc_member_ctrl_t); + + char *ctrl_base = xhc_shmem_create(&xc->ctrl_ds, ctrl_len, + ompi_comm, "ctrl", comm_count); + if(ctrl_base == NULL) { + RETURN_WITH_ERROR(return_code, OMPI_ERROR, comm_error); + } + + /* Manually "touch" to assert allocation in local NUMA node + * (assuming linux's default firt-touch-alloc NUMA policy) */ + memset(ctrl_base, 0, ctrl_len); + + xc->comm_ctrl = (void *) ctrl_base; + xc->member_ctrl = (void *) (ctrl_base + + sizeof(xhc_comm_ctrl_t) + smsc_reg_size); + } + + /* The comm's managers share the details of the communication structs + * with their children, so that they may attach to them. Because + * there's not any MPI communicator formed that includes (only) the + * members of the XHC comm, the sharing is achieved with a single + * Allgather, instead of a Broadcast inside each XHC comm. 
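+ * Non-manager members simply pick their manager's entry out of the gathered
+ * comm_ctrl_ds array further below, and attach to it.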
*/ + + ret = ompi_comm->c_coll->coll_allgather(&xc->ctrl_ds, + sizeof(opal_shmem_ds_t), MPI_BYTE, comm_ctrl_ds, + sizeof(opal_shmem_ds_t), MPI_BYTE, ompi_comm, + ompi_comm->c_coll->coll_allgather_module); + if(ret != OMPI_SUCCESS) { + RETURN_WITH_ERROR(return_code, ret, comm_error); + } + + // Attach to manager's shared structs + if(ompi_rank != xc->manager_rank) { + xc->ctrl_ds = comm_ctrl_ds[xc->manager_rank]; + + char *ctrl_base = xhc_shmem_attach(&xc->ctrl_ds); + if(ctrl_base == NULL) { + RETURN_WITH_ERROR(return_code, OMPI_ERROR, comm_error); + } + + xc->comm_ctrl = (void *) ctrl_base; + xc->member_ctrl = (void *) (ctrl_base + + sizeof(xhc_comm_ctrl_t) + smsc_reg_size); + } + + xc->my_member_ctrl = &xc->member_ctrl[xc->member_id]; + xc->my_member_info = &xc->member_info[xc->member_id]; + + // ---- + + comm_count++; + + continue; + + comm_error: { + xhc_comms_destroy(comms, comm_count+1); + comm_count = -1; + + goto end; + } + } + + REALLOC(comms, comm_count, xhc_comm_t); + + *comms_dst = comms; + *comm_count_dst = comm_count; + + end: + + free(comm_ctrl_ds); + free(comm_candidate); + + if(return_code != OMPI_SUCCESS) { + free(comms); + } + + return return_code; +} + +static void xhc_comms_destroy(xhc_comm_t *comms, int comm_count) { + bool is_manager = true; + + for(int i = 0; i < comm_count; i++) { + xhc_comm_t *xc = &comms[i]; + + if(xc->member_id != 0) { + is_manager = false; + } + + free(xc->member_info); + + if(xc->reduce_queue) { + OPAL_LIST_RELEASE(xc->reduce_queue); + } + + if(xc->comm_ctrl) { + if(is_manager) { + // OMPI issue #11123 + // opal_shmem_unlink(&xc->ctrl_ds); + (void) is_manager; + } + + opal_shmem_segment_detach(&xc->ctrl_ds); + } + + *xc = (xhc_comm_t) {0}; + } +} + +static int xhc_print_info(xhc_module_t *module, + ompi_communicator_t *comm, xhc_data_t *data) { + + int rank = ompi_comm_rank(comm); + int ret; + + if(rank == 0) { + char *drval_str; + char *lb_rla_str; + char *un_min_str; + + switch(mca_coll_xhc_component.dynamic_reduce) { + case OMPI_XHC_DYNAMIC_REDUCE_DISABLED: + drval_str = "OFF"; break; + case OMPI_XHC_DYNAMIC_REDUCE_NON_FLOAT: + drval_str = "ON (non-float)"; break; + case OMPI_XHC_DYNAMIC_REDUCE_ALL: + drval_str = "ON (all)"; break; + default: + drval_str = "???"; + } + + switch(mca_coll_xhc_component.lb_reduce_leader_assist) { + case OMPI_XHC_LB_RLA_TOP_LEVEL: + lb_rla_str = "top level"; break; + case OMPI_XHC_LB_RLA_FIRST_CHUNK: + lb_rla_str = "first chunk"; break; + case OMPI_XHC_LB_RLA_TOP_LEVEL | OMPI_XHC_LB_RLA_FIRST_CHUNK: + lb_rla_str = "top level + first chunk"; break; + case OMPI_XHC_LB_RLA_ALL: + lb_rla_str = "all"; break; + default: + lb_rla_str = "???"; + } + + ret = opal_asprintf(&un_min_str, " (min '%zu' bytes)", + mca_coll_xhc_component.uniform_chunks_min); + if(ret < 0) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + printf("------------------------------------------------\n" + "OMPI coll/xhc @ %s, priority %d\n" + " dynamic leader '%s', dynamic reduce '%s'\n" + " reduce load-balancing leader-assist '%s'\n" + " allreduce uniform chunks '%s'%s\n" + " CICO up until %zu bytes, barrier root %d\n\n" + "------------------------------------------------\n", + comm->c_name, mca_coll_xhc_component.priority, + (mca_coll_xhc_component.dynamic_leader ? "ON" : "OFF"), + drval_str, lb_rla_str, + (mca_coll_xhc_component.uniform_chunks ? "ON" : "OFF"), + (mca_coll_xhc_component.uniform_chunks ? 
un_min_str : ""), + mca_coll_xhc_component.cico_max, + mca_coll_xhc_component.barrier_root); + + free(un_min_str); + } + + for(int i = 0; i < data->comm_count; i++) { + char *mlist = NULL; + char *tmp; + + ret = opal_asprintf(&mlist, "%d", data->comms[i].manager_rank); + if(ret < 0) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + for(int m = 1; m < data->comms[i].size; m++) { + if(m == data->comms[i].member_id) { + if(i == 0 || data->comms[i-1].manager_rank == rank) { + ret = opal_asprintf(&tmp, "%s %d", mlist, rank); + } else { + ret = opal_asprintf(&tmp, "%s _", mlist); + } + } else { + ret = opal_asprintf(&tmp, "%s x", mlist); + } + + free(mlist); + mlist = tmp; + + if(ret < 0) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + } + + printf("XHC comm loc=0x%08x chunk_size=%zu with %d members [%s]\n", + data->comms[i].locality, data->comms[i].chunk_size, + data->comms[i].size, mlist); + + free(mlist); + } + + return OMPI_SUCCESS; +} + +// ------------------------------------------------ + +static void *xhc_shmem_create(opal_shmem_ds_t *seg_ds, size_t size, + ompi_communicator_t *ompi_comm, const char *name_chr_s, int name_chr_i) { + + char *shmem_file; + int ret; + + // xhc_shmem_seg.@..:_: + + ret = opal_asprintf(&shmem_file, "%s" OPAL_PATH_SEP "xhc_shmem_seg.%u@%s.%x.%d:%d_%s:%d", + mca_coll_xhc_component.shmem_backing, geteuid(), opal_process_info.nodename, + OPAL_PROC_MY_NAME.jobid, ompi_comm_rank(MPI_COMM_WORLD), ompi_comm_get_local_cid(ompi_comm), + name_chr_s, name_chr_i); + + if(ret < 0) { + return NULL; + } + + // Not 100% sure what this does!, copied from btl/sm + opal_pmix_register_cleanup(shmem_file, false, false, false); + + ret = opal_shmem_segment_create(seg_ds, shmem_file, size); + + free(shmem_file); + + if(ret != OPAL_SUCCESS) { + opal_output_verbose(MCA_BASE_VERBOSE_ERROR, + ompi_coll_base_framework.framework_output, + "coll:xhc: Error: Could not create shared memory segment"); + + return NULL; + } + + void *addr = xhc_shmem_attach(seg_ds); + + if(addr == NULL) { + opal_shmem_unlink(seg_ds); + } + + return addr; +} + +static void *xhc_shmem_attach(opal_shmem_ds_t *seg_ds) { + void *addr = opal_shmem_segment_attach(seg_ds); + + if(addr == NULL) { + opal_output_verbose(MCA_BASE_VERBOSE_ERROR, + ompi_coll_base_framework.framework_output, + "coll:xhc: Error: Could not attach to shared memory segment"); + } + + return addr; +} + +static mca_smsc_endpoint_t *xhc_smsc_ep(xhc_peer_info_t *peer_info) { + if(!peer_info->smsc_ep) { + peer_info->smsc_ep = MCA_SMSC_CALL(get_endpoint, &peer_info->proc->super); + + if(!peer_info->smsc_ep) { + opal_output_verbose(MCA_BASE_VERBOSE_ERROR, + ompi_coll_base_framework.framework_output, + "coll:xhc: Error: Failed to initialize smsc endpoint"); + + return NULL; + } + } + + return peer_info->smsc_ep; +} + +// ------------------------------------------------ + +void *mca_coll_xhc_get_cico(xhc_peer_info_t *peer_info, int rank) { + if(OMPI_XHC_CICO_MAX == 0) { + return NULL; + } + + if(peer_info[rank].cico_buffer == NULL) { + peer_info[rank].cico_buffer = xhc_shmem_attach(&peer_info[rank].cico_ds); + } + + return peer_info[rank].cico_buffer; +} + +int mca_coll_xhc_copy_expose_region(void *base, size_t len, xhc_copy_data_t **region_data) { + if(mca_smsc_base_has_feature(MCA_SMSC_FEATURE_REQUIRE_REGISTRATION)) { + void *data = MCA_SMSC_CALL(register_region, base, len); + + if(data == NULL) { + opal_output_verbose(MCA_BASE_VERBOSE_ERROR, + ompi_coll_base_framework.framework_output, + "coll:xhc: Error: Failed to register memory region with smsc"); + + return 
-1; + } + + *region_data = data; + } + + return 0; +} + +void mca_coll_xhc_copy_region_post(void *dst, xhc_copy_data_t *region_data) { + memcpy(dst, region_data, mca_smsc_base_registration_data_size()); +} + +int mca_coll_xhc_copy_from(xhc_peer_info_t *peer_info, + void *dst, void *src, size_t size, void *access_token) { + + mca_smsc_endpoint_t *smsc_ep = xhc_smsc_ep(peer_info); + + if(smsc_ep == NULL) { + return -1; + } + + int status = MCA_SMSC_CALL(copy_from, smsc_ep, + dst, src, size, access_token); + + return (status == OPAL_SUCCESS ? 0 : -1); +} + +void mca_coll_xhc_copy_close_region(xhc_copy_data_t *region_data) { + if(mca_smsc_base_has_feature(MCA_SMSC_FEATURE_REQUIRE_REGISTRATION)) + MCA_SMSC_CALL(deregister_region, region_data); +} + +void *mca_coll_xhc_get_registration(xhc_peer_info_t *peer_info, + void *peer_vaddr, size_t size, xhc_reg_t **reg) { + + mca_smsc_endpoint_t *smsc_ep = xhc_smsc_ep(peer_info); + + if(smsc_ep == NULL) { + return NULL; + } + + /* MCA_RCACHE_FLAGS_PERSIST will cause the registration to stick around. + * Though actually, because smsc/xpmem initializes the ref count to 2, + * as a means of keeping the registration around (instead of using the + * flag), our flag here doesn't have much effect. If at some point we + * would wish to actually detach memory in some or all cases, we should + * either call the unmap method twice, or reach out to Open MPI devs and + * inquire about the ref count. */ + + void *local_ptr; + + *reg = MCA_SMSC_CALL(map_peer_region, smsc_ep, + MCA_RCACHE_FLAGS_PERSIST, peer_vaddr, size, &local_ptr); + + if(*reg == NULL) { + return NULL; + } + + return local_ptr; +} + +/* Won't actually unmap/detach, since we've set + * the "persist" flag while creating the mapping */ +void mca_coll_xhc_return_registration(xhc_reg_t *reg) { + MCA_SMSC_CALL(unmap_peer_region, reg); +} diff --git a/ompi/mca/coll/xhc/coll_xhc.h b/ompi/mca/coll/xhc/coll_xhc.h new file mode 100644 index 00000000000..0de32f03b46 --- /dev/null +++ b/ompi/mca/coll/xhc/coll_xhc.h @@ -0,0 +1,514 @@ +/* + * Copyright (c) 2021-2023 Computer Architecture and VLSI Systems (CARV) + * Laboratory, ICS Forth. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_COLL_XHC_EXPORT_H +#define MCA_COLL_XHC_EXPORT_H + +#include "ompi_config.h" + +#include +#include + +#include "mpi.h" + +#include "ompi/mca/mca.h" +#include "ompi/mca/coll/coll.h" +#include "ompi/mca/coll/base/base.h" +#include "ompi/communicator/communicator.h" +#include "ompi/datatype/ompi_datatype.h" +#include "ompi/op/op.h" + +#include "opal/mca/shmem/shmem.h" +#include "opal/mca/smsc/smsc.h" + +#include "coll_xhc_atomic.h" + +#define RETURN_WITH_ERROR(var, err, label) do {(var) = (err); goto label;} \ + while(0) + +#define OBJ_RELEASE_IF_NOT_NULL(obj) do {if((obj) != NULL) OBJ_RELEASE(obj);} while(0) + +#define REALLOC(p, s, t) do {void *_tmp = realloc(p, (s)*sizeof(t)); \ + if(_tmp) (p) = _tmp;} while(0) + +#define PEER_IS_LOCAL(peer_info, rank, loc) \ + (((peer_info)[(rank)].locality & (loc)) == (loc)) + +#define OMPI_XHC_LOC_EXT_BITS (8*(sizeof(xhc_loc_t) - sizeof(opal_hwloc_locality_t))) +#define OMPI_XHC_LOC_EXT_START (8*sizeof(opal_hwloc_locality_t)) + +// --- + +#define OMPI_XHC_ACK_WIN 0 + +// Align to CPU cache line (portable way to obtain it?) +#define OMPI_XHC_ALIGN 64 + +// Call opal_progress every this many ticks when busy-waiting +#define OMPI_XHC_OPAL_PROGRESS_CYCLE 10000 + +/* Reduction leader-member load balancing, AKA should leaders reduce data? 
+ * Normally, non-leaders reduce and leaders propagate. But there are instances + * where leaders can/should also help with the group's reduction load. + * + * OMPI_XHC_LB_RLA_TOP_LEVEL: The top level's leader performs reductions + * on the top level as if a common member + * + * OMPI_XHC_LB_RLA_FIRST_CHUNK: Leaders reduce only a single chunk, on + * each level, at the beginning of the operation + * + * (OMPI_XHC_LB_RLA_TOP_LEVEL and OMPI_XHC_LB_RLA_FIRST_CHUNK are combinable) + * + * OMPI_XHC_LB_RLM_ALL: All leaders performs reductions exactly as if + * common members + * + * Generally, we might not want leaders reducing, as that may lead to load + * imbalance, since they will also have to reduce the comm's result(s) + * on upper levels. Unless a leader is also one on all levels! (e.g. the + * top-level leader). This leader should probably be assisting in the + * reduction; otherwise, the only thing he will be doing is checking + * and updating synchronization flags. + * + * Regarding the load balancing problem, the leaders will actually not have + * anything to do until the first chunk is reduced, so they might as well be + * made to help the other members with this first chunk. Keep in mind though, + * this might increase the memory load, and cause this first chunk to take + * slightly more time to be produced. */ +#define OMPI_XHC_LB_RLA_TOP_LEVEL 0x01 +#define OMPI_XHC_LB_RLA_FIRST_CHUNK 0x02 +#define OMPI_XHC_LB_RLA_ALL 0x80 + +enum { + OMPI_XHC_DYNAMIC_REDUCE_DISABLED, + OMPI_XHC_DYNAMIC_REDUCE_NON_FLOAT, + OMPI_XHC_DYNAMIC_REDUCE_ALL +}; + +#define OMPI_XHC_CICO_MAX (mca_coll_xhc_component.cico_max) + +/* For other configuration options and default + * values check coll_xhc_component.c */ + +// --- + +BEGIN_C_DECLS + +// ---------------------------------------- + +typedef uint32_t xhc_loc_t; +typedef void xhc_reg_t; +typedef void xhc_copy_data_t; + +typedef struct mca_coll_xhc_component_t mca_coll_xhc_component_t; +typedef struct mca_coll_xhc_module_t mca_coll_xhc_module_t; +typedef struct mca_coll_xhc_module_t xhc_module_t; + +typedef struct xhc_coll_fns_t xhc_coll_fns_t; +typedef struct xhc_peer_info_t xhc_peer_info_t; + +typedef struct xhc_data_t xhc_data_t; +typedef struct xhc_comm_t xhc_comm_t; + +typedef struct xhc_comm_ctrl_t xhc_comm_ctrl_t; +typedef struct xhc_member_ctrl_t xhc_member_ctrl_t; +typedef struct xhc_member_info_t xhc_member_info_t; + +typedef struct xhc_reduce_area_t xhc_reduce_area_t; +typedef struct xhc_reduce_queue_item_t xhc_rq_item_t; + +typedef struct xhc_rank_range_t xhc_rank_range_t; +typedef struct xhc_loc_def_t xhc_loc_def_t; + +OMPI_DECLSPEC extern mca_coll_xhc_component_t mca_coll_xhc_component; +OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_coll_xhc_module_t); +OMPI_DECLSPEC OBJ_CLASS_DECLARATION(xhc_rq_item_t); +OMPI_DECLSPEC OBJ_CLASS_DECLARATION(xhc_loc_def_item_t); + +// ---------------------------------------- + +struct xhc_coll_fns_t { + mca_coll_base_module_allreduce_fn_t coll_allreduce; + mca_coll_base_module_t *coll_allreduce_module; + + mca_coll_base_module_barrier_fn_t coll_barrier; + mca_coll_base_module_t *coll_barrier_module; + + mca_coll_base_module_bcast_fn_t coll_bcast; + mca_coll_base_module_t *coll_bcast_module; + + mca_coll_base_module_reduce_fn_t coll_reduce; + mca_coll_base_module_t *coll_reduce_module; +}; + +struct mca_coll_xhc_component_t { + mca_coll_base_component_t super; + + int priority; + bool print_info; + + char *shmem_backing; + + bool dynamic_leader; + + int barrier_root; + + int dynamic_reduce; + int 
lb_reduce_leader_assist; + + bool force_reduce; + + bool uniform_chunks; + size_t uniform_chunks_min; + + size_t cico_max; + + char *hierarchy_mca; + char *chunk_size_mca; +}; + +struct mca_coll_xhc_module_t { + mca_coll_base_module_t super; + + /* pointers to functions/modules of + * previous coll components for fallback */ + xhc_coll_fns_t prev_colls; + + // copied from comm + int comm_size; + int rank; + + // list of localities to consider during grouping + char *hierarchy_string; + xhc_loc_t *hierarchy; + int hierarchy_len; + + // list of requested chunk sizes, to be applied to comms + size_t *chunks; + int chunks_len; + + // temporary (private) internal buffer, for methods like Reduce + void *rbuf; + size_t rbuf_size; + + // xhc-specific info for every other rank in the comm + xhc_peer_info_t *peer_info; + + xhc_data_t *data; + + bool init; +}; + +struct xhc_peer_info_t { + xhc_loc_t locality; + + ompi_proc_t *proc; + mca_smsc_endpoint_t *smsc_ep; + + opal_shmem_ds_t cico_ds; + void *cico_buffer; +}; + +struct xhc_data_t { + xhc_comm_t *comms; + int comm_count; + + xf_sig_t pvt_coll_seq; +}; + +struct xhc_comm_t { + xhc_loc_t locality; + size_t chunk_size; + + int size; + int manager_rank; + int member_id; + + // --- + + // Am I a leader in the current collective? + bool is_coll_leader; + + // Have handshaked with all members in the current op? (useful to leader) + bool all_joined; + + /* A reduce set defines a range/area of data to be reduced, and its + * settings. We require multiple areas, because there might be different + * circumstances: + * + * 1. Under certain load balancing policies, leaders perform reductions + * for the just one chunk, and then they don't. Thus, the worker count + * changes, and the settings have to recomputed for the next areas. + * + * 2. During the "middle" of the operation, all members continuously + * reduce data in maximum-sized pieces (according to the configured + * chunk size). But, towards the end of the operation, the remaining + * elements are less than ((workers * elem_chunk)), we have to + * recalculate `elem_chunk`, so that all workers will perform + * equal work. 
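+ *
+ * (The fields below are filled in by init_reduce_areas() in
+ * coll_xhc_allreduce.c, once per collective, based on the message size,
+ * the chunk size, and the load-balancing settings.)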
*/ + struct xhc_reduce_area_t { + int start; // where the area begins + int len; // the size of the area + int workers; // how many processes perform reductions in the area + int stride; /* how much to advance inside the area after + * each reduction, unused for non-combo areas */ + + // local process settings + int work_begin; // where to begin the first reduction from + int work_end; // up to where to reduce + int work_chunk; // how much to reduce each time + int work_leftover; /* assigned leftover elements to include as + * part of the last reduction in the area */ + } reduce_area[3]; + int n_reduce_areas; + + struct xhc_member_info_t { + xhc_reg_t *sbuf_reg, *rbuf_reg; + void *sbuf, *rbuf; + bool init; + } *member_info; + + // Queue to keep track of individual reduction progress for different peers + opal_list_t *reduce_queue; + + // --- + + xhc_comm_ctrl_t *comm_ctrl; + xhc_member_ctrl_t *member_ctrl; + + opal_shmem_ds_t ctrl_ds; + + // --- + + xhc_member_ctrl_t *my_member_ctrl; // = &member_ctrl[member_id] + xhc_member_info_t *my_member_info; // = &member_info[member_id] +}; + +struct xhc_comm_ctrl_t { + // We want leader_seq, coll_ack, coll_seq to all lie in their own cache lines + + volatile xf_sig_t leader_seq; + + volatile xf_sig_t coll_ack __attribute__((aligned(OMPI_XHC_ALIGN))); + + volatile xf_sig_t coll_seq __attribute__((aligned(OMPI_XHC_ALIGN))); + + /* - Reason *NOT* to keep below fields in the same cache line as coll_seq: + * + * While members busy-wait on leader's coll_seq, initializing the rest of + * the fields will trigger cache-coherency-related "invalidate" and then + * "read miss" messages, for each store. + * + * - Reason to *DO* keep below fields in the same cache line as coll_seq: + * + * Members load from coll_seq, and implicitly fetch the entire cache + * line, which also contains the values of the other fields, that will + * also need to be loaded soon. + * + * (not 100% sure of my description here) + * + * Bcast seemed to perform better with the second option, so I went with + * that one. The best option might also be influenced by the ranks' order + * of entering in the operation. + */ + + // "Guarded" by members' coll_seq + volatile int leader_id; + volatile int leader_rank; + volatile int cico_id; + + void* volatile data_vaddr; + volatile xf_size_t bytes_ready; + + char access_token[]; +} __attribute__((aligned(OMPI_XHC_ALIGN))); + +struct xhc_member_ctrl_t { + volatile xf_sig_t member_ack; // written by member + + // written by member, at beginning of operation + volatile xf_sig_t member_seq __attribute__((aligned(OMPI_XHC_ALIGN))); + volatile int rank; + + void* volatile sbuf_vaddr; + void* volatile rbuf_vaddr; + volatile int cico_id; + + // reduction progress counters, written by member + volatile xf_int_t reduce_ready; + volatile xf_int_t reduce_done; +} __attribute__((aligned(OMPI_XHC_ALIGN))); + +struct xhc_reduce_queue_item_t { + opal_list_item_t super; + int member; // ID of member + int count; // current reduction progress for member + int area_id; // current reduce area +}; + +// ---------------------------------------- + +struct xhc_rank_range_t { + int start_rank, end_rank; +}; + +struct xhc_loc_def_t { + opal_list_item_t super; + + opal_hwloc_locality_t named_loc; + + xhc_rank_range_t *rank_list; + int rank_list_len; + + int split; + int max_ranks; + + bool repeat; +}; + +// ---------------------------------------- + +// coll_xhc_component.c +// -------------------- + +#define xhc_component_parse_hierarchy(...) 
mca_coll_xhc_component_parse_hierarchy(__VA_ARGS__) +#define xhc_component_parse_chunk_sizes(...) mca_coll_xhc_component_parse_chunk_sizes(__VA_ARGS__) + +int mca_coll_xhc_component_init_query(bool enable_progress_threads, + bool enable_mpi_threads); + +int mca_coll_xhc_component_parse_hierarchy(const char *val_str, + opal_list_t **level_defs_dst, int *nlevel_defs_dst); +int mca_coll_xhc_component_parse_chunk_sizes(const char *val_str, + size_t **vals_dst, int *len_dst); + +// coll_xhc_module.c +// ----------------- + +#define xhc_module_install_fns(...) mca_coll_xhc_module_install_fns(__VA_ARGS__) +#define xhc_module_install_fallback_fns(...) mca_coll_xhc_module_install_fallback_fns(__VA_ARGS__) + +#define xhc_module_prepare_hierarchy(...) mca_coll_xhc_module_prepare_hierarchy(__VA_ARGS__) + +mca_coll_base_module_t *mca_coll_xhc_module_comm_query( + ompi_communicator_t *comm, int *priority); + +int mca_coll_xhc_module_enable(mca_coll_base_module_t *module, + ompi_communicator_t *comm); +int mca_coll_xhc_module_disable(mca_coll_base_module_t *module, + ompi_communicator_t *comm); + +void mca_coll_xhc_module_install_fallback_fns(xhc_module_t *module, + ompi_communicator_t *comm, xhc_coll_fns_t *prev_fns_dst); +void mca_coll_xhc_module_install_fns(xhc_module_t *module, + ompi_communicator_t *comm, xhc_coll_fns_t fns); + +int mca_coll_xhc_module_prepare_hierarchy(mca_coll_xhc_module_t *module, + ompi_communicator_t *comm); + +// coll_xhc.c +// ---------- + +#define xhc_lazy_init(...) mca_coll_xhc_lazy_init(__VA_ARGS__) +#define xhc_fini(...) mca_coll_xhc_fini(__VA_ARGS__) + +#define xhc_get_cico(...) mca_coll_xhc_get_cico(__VA_ARGS__) + +#define xhc_copy_expose_region(...) mca_coll_xhc_copy_expose_region(__VA_ARGS__) +#define xhc_copy_region_post(...) mca_coll_xhc_copy_region_post(__VA_ARGS__) +#define xhc_copy_from(...) mca_coll_xhc_copy_from(__VA_ARGS__) +#define xhc_copy_close_region(...) mca_coll_xhc_copy_close_region(__VA_ARGS__) + +#define xhc_get_registration(...) mca_coll_xhc_get_registration(__VA_ARGS__) +#define xhc_return_registration(...) 
mca_coll_xhc_return_registration(__VA_ARGS__) + +int mca_coll_xhc_lazy_init(mca_coll_xhc_module_t *module, ompi_communicator_t *comm); +void mca_coll_xhc_fini(mca_coll_xhc_module_t *module); + +void *mca_coll_xhc_get_cico(xhc_peer_info_t *peer_info, int rank); + +int mca_coll_xhc_copy_expose_region(void *base, size_t len, xhc_copy_data_t **region_data); +void mca_coll_xhc_copy_region_post(void *dst, xhc_copy_data_t *region_data); +int mca_coll_xhc_copy_from(xhc_peer_info_t *peer_info, void *dst, + void *src, size_t size, void *access_token); +void mca_coll_xhc_copy_close_region(xhc_copy_data_t *region_data); + +void *mca_coll_xhc_get_registration(xhc_peer_info_t *peer_info, + void *peer_vaddr, size_t size, xhc_reg_t **reg); +void mca_coll_xhc_return_registration(xhc_reg_t *reg); + +// Primitives (respective file) +// ---------------------------- + +int mca_coll_xhc_bcast(void *buf, int count, ompi_datatype_t *datatype, + int root, ompi_communicator_t *comm, mca_coll_base_module_t *module); + +int mca_coll_xhc_barrier(ompi_communicator_t *ompi_comm, + mca_coll_base_module_t *module); + +int mca_coll_xhc_reduce(const void *sbuf, void *rbuf, + int count, ompi_datatype_t *datatype, ompi_op_t *op, int root, + ompi_communicator_t *comm, mca_coll_base_module_t *module); + +int mca_coll_xhc_allreduce(const void *sbuf, void *rbuf, + int count, ompi_datatype_t *datatype, ompi_op_t *op, + ompi_communicator_t *comm, mca_coll_base_module_t *module); + +// Miscellaneous +// ------------- + +#define xhc_allreduce_internal(...) mca_coll_xhc_allreduce_internal(__VA_ARGS__) + +int mca_coll_xhc_allreduce_internal(const void *sbuf, void *rbuf, int count, + ompi_datatype_t *datatype, ompi_op_t *op, ompi_communicator_t *ompi_comm, + mca_coll_base_module_t *module, bool require_bcast); + +// ---------------------------------------- + +// Rollover-safe check that flag has reached/exceeded thresh, with max deviation +static inline bool CHECK_FLAG(volatile xf_sig_t *flag, + xf_sig_t thresh, xf_sig_t win) { + + // This is okay because xf_sig_t is unsigned. Take care. + // The cast's necessity is dependent on the size of xf_sig_t + return ((xf_sig_t) (*flag - thresh) <= win); +} + +static inline void WAIT_FLAG(volatile xf_sig_t *flag, + xf_sig_t thresh, xf_sig_t win) { + bool ready = false; + + do { + for(int i = 0; i < OMPI_XHC_OPAL_PROGRESS_CYCLE; i++) { + if(CHECK_FLAG(flag, thresh, win)) { + ready = true; + break; + } + + /* xf_sig_t f = *flag; + if(CHECK_FLAG(&f, thresh, win)) { + ready = true; + break; + } else if(CHECK_FLAG(&f, thresh, 1000)) + printf("Debug: Flag check with window %d failed, " + "but succeeded with window 1000. flag = %d, " + "thresh = %d\n", win, f, thresh); */ + } + + if(!ready) { + opal_progress(); + } + } while(!ready); +} + +// ---------------------------------------- + +END_C_DECLS + +#endif diff --git a/ompi/mca/coll/xhc/coll_xhc_allreduce.c b/ompi/mca/coll/xhc/coll_xhc_allreduce.c new file mode 100644 index 00000000000..d45065b9dc0 --- /dev/null +++ b/ompi/mca/coll/xhc/coll_xhc_allreduce.c @@ -0,0 +1,1121 @@ +/* + * Copyright (c) 2021-2023 Computer Architecture and VLSI Systems (CARV) + * Laboratory, ICS Forth. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" +#include "mpi.h" + +#include "ompi/constants.h" +#include "ompi/datatype/ompi_datatype.h" +#include "ompi/communicator/communicator.h" +#include "ompi/op/op.h" + +#include "opal/mca/rcache/base/base.h" +#include "opal/util/show_help.h" +#include "opal/util/minmax.h" + +#include "coll_xhc.h" + +#define MAX_REDUCE_AREAS(comm) \ + ((int)(sizeof((comm)->reduce_area)/sizeof((comm)->reduce_area[0]))) + +OBJ_CLASS_INSTANCE(xhc_rq_item_t, opal_list_item_t, NULL, NULL); + +// ----------------------------- + +/* For the reduction areas, see comments in xhc_reduce_area_t's definition. + * For the leader reduction assistance policies see the flag definitions. */ +static void init_reduce_areas(xhc_comm_t *comms, + int comm_count, int allreduce_count, size_t dtype_size) { + + bool uniform_chunks = mca_coll_xhc_component.uniform_chunks; + int lb_rla = mca_coll_xhc_component.lb_reduce_leader_assist; + + for(int i = 0; i < comm_count; i++) { + xhc_comm_t *xc = &comms[i]; + + int avail_workers[MAX_REDUCE_AREAS(xc)]; + + for(int area_id = 0; area_id < MAX_REDUCE_AREAS(xc); area_id++) { + int workers = xc->size - 1; + + if(lb_rla & OMPI_XHC_LB_RLA_TOP_LEVEL) { + if(i == comm_count - 1 && workers < xc->size) + workers++; + } + + if(lb_rla & OMPI_XHC_LB_RLA_FIRST_CHUNK) { + if(area_id == 0 && workers < xc->size) + workers++; + } + + if(lb_rla & OMPI_XHC_LB_RLA_ALL) { + workers = xc->size; + } + + avail_workers[area_id] = workers; + } + + // Min/max work that a worker may perform (one step) + int min_elems = mca_coll_xhc_component.uniform_chunks_min / dtype_size; + int max_elems = xc->chunk_size / dtype_size; + + int area_id = 0, el_idx = 0; + + while(area_id < MAX_REDUCE_AREAS(xc) && el_idx < allreduce_count) { + xhc_reduce_area_t *area = &xc->reduce_area[area_id]; + + *area = (xhc_reduce_area_t) {0}; + + int remaining = allreduce_count - el_idx; + int workers = avail_workers[area_id]; + + int elems_per_member; + int repeat = 0; + + int area_elems = opal_min(max_elems * workers, remaining); + + /* We should consider the future size of the next area. If it's + * too small in relation to the minimum chunk (min_elems), some + * workers of the next area won't perform work, leading to load + * imbalance. In this case, we elect to either shrink the current + * area so that we will be able to better balance the load in the + * next one, or if the elements that remain for the next area are + * especially few, we make this area absorb the next one. + * Specifically, we absorb it if the increase of each worker's + * load is no more than 10% of the maximum load set. */ + if(uniform_chunks && area_id < MAX_REDUCE_AREAS(xc) - 1) { + int next_workers = avail_workers[area_id+1]; + int next_remaining = allreduce_count - (el_idx + area_elems); + + if(next_remaining < next_workers * min_elems) { + if(next_remaining/workers <= max_elems/10) { + area_elems += next_remaining; + } else { + int ideal_donate = next_workers * min_elems - next_remaining; + + /* Don't donate so much elements that this area + * won't cover its own min reduction chunk size */ + int max_donate = area_elems - workers * min_elems; + max_donate = (max_donate > 0 ? max_donate : 0); + + area_elems -= opal_min(ideal_donate, max_donate); + } + } + } + + if(uniform_chunks) { + /* The elements might not be enough for every worker to do + * work. 
We calculate how many workers we need so that no + * one of them does less than min_elems work, and use the + * result to calculate the final elements per member. */ + workers = opal_min(area_elems/min_elems, workers); + workers = opal_max(workers, 1); + + elems_per_member = area_elems / workers; + } else { + elems_per_member = max_elems; + workers = area_elems/max_elems; + } + + // If this is the middle area, try to maximize its size + if(area_id == 1 && workers > 0) { + int set = workers * elems_per_member; + repeat = (int)((remaining-area_elems)/set); + area_elems += repeat * set; + } + + area->start = el_idx; + area->len = area_elems; + area->workers = workers; + area->stride = workers * elems_per_member; + + /* My ID, assuming that if some member is not reducing, it is + * the one with ID=0, because currently only member 0 becomes + * the leader, and the leader is the only one that might not + * be reducing. */ + int worker_id = xc->member_id - (xc->size - avail_workers[area_id]); + + area->work_begin = el_idx + worker_id * elems_per_member; + area->work_chunk = (worker_id >= 0 && worker_id < workers ? + elems_per_member : 0); + + area->work_leftover = 0; + + int leftover_elems = (workers > 0 ? + (area_elems % (workers * elems_per_member)) : area_elems); + if(leftover_elems) { + if(worker_id == (uniform_chunks ? workers - 1 : workers)) { + area->work_leftover = leftover_elems; + } + } + + area->work_end = area->work_begin + (repeat * area->stride) + + area->work_chunk + area->work_leftover; + + el_idx += area_elems; + area_id++; + } + + assert(el_idx == allreduce_count); + + xc->n_reduce_areas = area_id; + + // Erase zero-work areas + while(xc->n_reduce_areas > 0 + && xc->reduce_area[xc->n_reduce_areas - 1].work_chunk == 0 + && xc->reduce_area[xc->n_reduce_areas - 1].work_leftover == 0) { + xc->n_reduce_areas--; + } + + /* If not a leader on this comm, nothing + * to do on next ones whatsoever */ + if(!xc->is_coll_leader) { + break; + } + } +} + +static void xhc_allreduce_init_local(xhc_comm_t *comms, int comm_count, + int allreduce_count, size_t dtype_size, xf_sig_t seq) { + + for(int i = 0; i < comm_count; i++) { + xhc_comm_t *xc = &comms[i]; + + xc->is_coll_leader = false; + + for(int m = 0; m < xc->size; m++) { + xc->member_info[m] = (xhc_member_info_t) {0}; + } + + xc->all_joined = false; + } + + for(int i = 0; i < comm_count; i++) { + xhc_comm_t *xc = &comms[i]; + + /* The manager is the leader. Even in the dynamic reduce case, + * there (currently) shouldn't be any real benefit from the + * leader being dynamic in allreduce. */ + if(xc->member_id != 0) { + break; + } + + xc->comm_ctrl->leader_seq = seq; + xc->is_coll_leader = true; + } + + init_reduce_areas(comms, comm_count, allreduce_count, dtype_size); + + for(int i = 0; i < comm_count; i++) { + xhc_comm_t *xc = &comms[i]; + + int initial_count = (xc->n_reduce_areas > 0 ? 
+ xc->reduce_area[0].work_begin : allreduce_count); + + int m = 0; + OPAL_LIST_FOREACH_DECL(item, xc->reduce_queue, xhc_rq_item_t) { + if(m == xc->member_id) { + m++; + } + + *item = (xhc_rq_item_t) {.super = item->super, .member = m++, + .count = initial_count, .area_id = 0}; + } + + if(!xc->is_coll_leader) { + break; + } + } +} + +static void xhc_allreduce_init_comm(xhc_comm_t *comms, int comm_count, + void *rbuf, bool do_cico, int ompi_rank, xf_sig_t seq) { + + for(int i = 0; i < comm_count; i++) { + xhc_comm_t *xc = &comms[i]; + + if(!xc->is_coll_leader) { + break; + } + + WAIT_FLAG(&xc->comm_ctrl->coll_ack, seq - 1, 0); + + /* Because there is a control dependency with the load + * from coll_ack above and the code below, and because + * it is a load-store one (not load-load), I declare + * that a read-memory-barrier is not required here. */ + + xc->comm_ctrl->leader_id = xc->member_id; + xc->comm_ctrl->leader_rank = ompi_rank; + xc->comm_ctrl->data_vaddr = (!do_cico ? rbuf : NULL); + xc->comm_ctrl->bytes_ready = 0; + + xhc_atomic_wmb(); + + xc->comm_ctrl->coll_seq = seq; + } +} + +static void xhc_allreduce_init_member(xhc_comm_t *comms, int comm_count, + xhc_peer_info_t *peer_info, void *sbuf, void *rbuf, int allreduce_count, + bool do_cico, int ompi_rank, xf_sig_t seq) { + + for(int i = 0; i < comm_count; i++) { + xhc_comm_t *xc = &comms[i]; + + /* Essentially the value of reduce area-0's + * work_begin, as set in init_local() */ + int rq_first_count = ((xhc_rq_item_t *) + opal_list_get_first(xc->reduce_queue))->count; + + /* Make sure that the previous owner of my member ctrl (tip: can + * occur with dynamic leadership (or non-zero root!?), when it is + * implemented ^^) is not still using it. Also not that this + * previous owner will set member_ack only after the comm's coll_ack + * is set, so it also guarantees that no other member in the comm is + * accessing the member's flags from a previous collective. */ + WAIT_FLAG(&xc->my_member_ctrl->member_ack, seq - 1, 0); + + xc->my_member_ctrl->reduce_done = rq_first_count; + xc->my_member_ctrl->reduce_ready = (i == 0 && !do_cico ? allreduce_count : 0); + + xc->my_member_ctrl->rank = ompi_rank; + + if(!do_cico) { + xc->my_member_ctrl->sbuf_vaddr = (i == 0 ? sbuf : rbuf); + xc->my_member_ctrl->rbuf_vaddr = (xc->is_coll_leader ? rbuf : NULL); + + xc->my_member_ctrl->cico_id = -1; + + xc->my_member_info->sbuf = (i == 0 ? sbuf : rbuf); + xc->my_member_info->rbuf = rbuf; + } else { + xc->my_member_ctrl->sbuf_vaddr = NULL; + xc->my_member_ctrl->rbuf_vaddr = NULL; + + int cico_id = (i == 0 ? 
ompi_rank : comms[i-1].manager_rank); + xc->my_member_ctrl->cico_id = cico_id; + + xc->my_member_info->sbuf = xhc_get_cico(peer_info, cico_id); + xc->my_member_info->rbuf = xhc_get_cico(peer_info, ompi_rank); + } + + xhc_atomic_wmb(); + xc->my_member_ctrl->member_seq = seq; + + if(!xc->is_coll_leader) { + break; + } + } +} + +// ----------------------------- + +static int xhc_allreduce_attach_member(xhc_comm_t *xc, int member, + xhc_peer_info_t *peer_info, size_t bytes, bool do_cico, xf_sig_t seq) { + + if(xc->member_info[member].init) { + return 0; + } + + if(!do_cico) { + int member_rank = xc->member_ctrl[member].rank; + + void *sbuf_vaddr = xc->member_ctrl[member].sbuf_vaddr; + void *rbuf_vaddr = xc->member_ctrl[member].rbuf_vaddr; + + xc->member_info[member].sbuf = xhc_get_registration( + &peer_info[member_rank], sbuf_vaddr, bytes, + &xc->member_info[member].sbuf_reg); + + if(xc->member_info[member].sbuf == NULL) { + return -1; + } + + // Leaders will also share their rbuf + if(rbuf_vaddr) { + if(rbuf_vaddr != sbuf_vaddr) { + xc->member_info[member].rbuf = xhc_get_registration( + &peer_info[member_rank], rbuf_vaddr, bytes, + &xc->member_info[member].rbuf_reg); + + if(xc->member_info[member].rbuf == NULL) { + return -1; + } + } else + xc->member_info[member].rbuf = xc->member_info[member].sbuf; + } + } else { + /* Here's the deal with CICO buffers and the comm's manager: In order + * to avoid excessive amounts of attachments, ranks that are + * foreign to a comm only attach to the comm's manager's CICO buffer, + * instead of to every member's. Therefore, members will place their + * final data in the manager's CICO buffer, instead of the leader's + * (even though the leader and the manager actually very often are one + * and the same..). */ + + xc->member_info[member].sbuf = xhc_get_cico(peer_info, + xc->member_ctrl[member].cico_id); + + if(CHECK_FLAG(&xc->comm_ctrl->coll_seq, seq, 0) + && member == xc->comm_ctrl->leader_id) { + xc->member_info[member].rbuf = xhc_get_cico(peer_info, xc->manager_rank); + } + } + + xc->member_info[member].init = true; + + return 0; +} + +static void xhc_allreduce_leader_check_all_joined(xhc_comm_t *xc, xf_sig_t seq) { + for(int m = 0; m < xc->size; m++) { + if(m == xc->member_id) { + continue; + } + + if(!CHECK_FLAG(&xc->member_ctrl[m].member_seq, seq, 0)) { + return; + } + } + + xc->all_joined = true; +} + +static void xhc_allreduce_disconnect_peers(xhc_comm_t *comms, int comm_count) { + xhc_comm_t *xc = comms; + + while(xc && xc->is_coll_leader) { + xc = (xc != &comms[comm_count-1] ? 
xc + 1 : NULL); + } + + if(xc == NULL) { + return; + } + + xhc_reg_t *reg; + + for(int m = 0; m < xc->size; m++) { + if(m == xc->member_id) { + continue; + } + + if((reg = xc->member_info[m].sbuf_reg)) { + xhc_return_registration(reg); + } + + if((reg = xc->member_info[m].rbuf_reg)) { + xhc_return_registration(reg); + } + } +} + +// ----------------------------- + +static xhc_comm_t *xhc_allreduce_bcast_src_comm(xhc_comm_t *comms, int comm_count) { + xhc_comm_t *s = NULL; + + for(int i = 0; i < comm_count; i++) { + if(!comms[i].is_coll_leader) { + s = &comms[i]; + break; + } + } + + return s; +} + +static void xhc_allreduce_do_ack(xhc_comm_t *comms, int comm_count, xf_sig_t seq) { + for(int i = 0; i < comm_count; i++) { + xhc_comm_t *xc = &comms[i]; + + xc->my_member_ctrl->member_ack = seq; + + if(!xc->is_coll_leader) { + break; + } + + for(int m = 0; m < xc->size; m++) { + if(m == xc->member_id) { + continue; + } + + WAIT_FLAG(&xc->member_ctrl[m].member_ack, seq, OMPI_XHC_ACK_WIN); + } + + xc->comm_ctrl->coll_ack = seq; + } +} + +// ----------------------------- + +static void xhc_allreduce_cico_publish(xhc_comm_t *xc, void *data_src, + xhc_peer_info_t *peer_info, int ompi_rank, int allreduce_count, + size_t dtype_size) { + + int ready = xc->my_member_ctrl->reduce_ready; + + /* The chunk size here is just a means of pipelining the CICO + * publishing, for whichever case this might be necessary in. + * There isn't really any reason to consult reduce areas and + * their chunk sizes here.*/ + int elements = opal_min(xc->chunk_size/dtype_size, allreduce_count - ready); + + void *src = (char *) data_src + ready * dtype_size; + void *dst = (char *) xhc_get_cico(peer_info, ompi_rank) + ready * dtype_size; + + memcpy(dst, src, elements * dtype_size); + xhc_atomic_wmb(); + + volatile xf_int_t *rrp = &xc->my_member_ctrl->reduce_ready; + xhc_atomic_store_int(rrp, ready + elements); +} + +static int xhc_allreduce_reduce_get_next(xhc_comm_t *xc, + xhc_peer_info_t *peer_info, int allreduce_count, + size_t dtype_size, bool do_cico, bool out_of_order_reduce, + xf_sig_t seq, xhc_rq_item_t **item_dst) { + + xhc_rq_item_t *member_item = NULL; + int stalled_member = xc->size; + + /* Iterate the reduce queue, to determine which member's data to reduce, + * and from what index. The reduction queue aids in the implementation of + * the rationale that members that are not ready at some point should be + * temporarily skipped, to prevent stalling in the collective. Reasons + * that a member may not be "ready" are (1) it has not yet joined the + * collective, (2) the necessary data have not yet been produced (eg. + * because the member's children have not finished their reduction on the + * previous communicator) or have not been copied to the CICO buffer. + * However, when floating point data is concerned, skipping members and + * therefore doing certain reductions in non-deterministic order results + * to reproducibility problems. Hence the existence of the "dynamic reduce" + * switch; when enabled, members are skipped when not ready. When disabled, + * members are skipped, but only the data of members with a lower ID that + * the one that has stalled can be reduced (eg. member 2 has stalled, but + * reduction for future chunks of members 0 and 1 (only, not of member 3, + * even if it is ready) will begin instead of completely stalling). The + * reduction queue is sorted according to the reduction progress counter in + * each entry. 
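(As a hypothetical example: if members A, B and C have reduced + * 3, 1 and 2 chunks respectively, the queue is ordered B, C, A, + * and B's next chunk is the first one attempted.)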
This helps ensure fully reduced chunks are generated as soon + * as possible, so that leaders can quickly propagate them upwards. */ + OPAL_LIST_FOREACH_DECL(item, xc->reduce_queue, xhc_rq_item_t) { + int member = item->member; + + if(!xc->member_info[member].init + && CHECK_FLAG(&xc->member_ctrl[member].member_seq, seq, 0)) { + + xhc_atomic_rmb(); + + int ret = xhc_allreduce_attach_member(xc, member, peer_info, + allreduce_count * dtype_size, do_cico, seq); + + if(ret != 0) { + return ret; + } + } + + if(xc->member_info[member].init && item->count < allreduce_count) { + xhc_reduce_area_t *area = &xc->reduce_area[item->area_id]; + int elements = area->work_chunk; + + if(item->count + elements + area->work_leftover == area->work_end) { + elements += area->work_leftover; + } + + int self_ready = xc->my_member_ctrl->reduce_ready; + + volatile xf_int_t *rrp = &xc->member_ctrl[member].reduce_ready; + int member_ready = xhc_atomic_load_int(rrp); + + if(self_ready >= item->count + elements + && member_ready >= item->count + elements + && member < stalled_member) { + + member_item = item; + break; + } + } + + if(!out_of_order_reduce) { + stalled_member = opal_min(stalled_member, member); + } + } + + if(member_item) { + opal_list_remove_item(xc->reduce_queue, (opal_list_item_t *) member_item); + } + + *item_dst = member_item; + + return 0; +} + +static void xhc_allreduce_rq_item_analyze(xhc_comm_t *xc, xhc_rq_item_t *item, + bool *first_reduction, bool *last_reduction) { + + *first_reduction = false; + *last_reduction = false; + + if(opal_list_get_size(xc->reduce_queue) == 0) { + *first_reduction = true; + *last_reduction = true; + } else { + xhc_rq_item_t *first_item = (xhc_rq_item_t *) + opal_list_get_first(xc->reduce_queue); + + xhc_rq_item_t *last_item = (xhc_rq_item_t *) + opal_list_get_last(xc->reduce_queue); + + /* If this count is equal or larger than the last one, it means that + * no other count in the queue is larger than it. Therefore, this is the + * first reduction taking place for the "member_item->count" chunk idx. */ + if(item->count >= last_item->count) { + *first_reduction = true; + } + + /* If this count is uniquely minimum in the queue, this is the + * last reduction taking place for this specific chunk index. */ + if(item->count < first_item->count) { + *last_reduction = true; + } + } +} + +static void xhc_allreduce_do_reduce(xhc_comm_t *xc, xhc_rq_item_t *member_item, + int allreduce_count, ompi_datatype_t *dtype, size_t dtype_size, + ompi_op_t *op) { + + xhc_reduce_area_t *area = &xc->reduce_area[member_item->area_id]; + int elements = area->work_chunk; + + if(member_item->count + elements + area->work_leftover == area->work_end) { + elements += area->work_leftover; + } + + size_t offset = member_item->count * dtype_size; + + char *src = (char *) xc->member_info[member_item->member].sbuf + offset; + + char *dst; + char *src2 = NULL; + + bool first_reduction, last_reduction; + + xhc_allreduce_rq_item_analyze(xc, member_item, + &first_reduction, &last_reduction); + + /* Only access comm_ctrl when it's the last reduction. 
Otherwise, + * it's not guaranteed that the leader will have initialized it yet.*/ + if(last_reduction) { + dst = (char *) xc->member_info[xc->comm_ctrl->leader_id].rbuf + offset; + } else { + dst = (char *) xc->my_member_info->rbuf + offset; + } + + if(first_reduction) { + src2 = (char *) xc->my_member_info->sbuf + offset; + } else if(last_reduction) { + src2 = (char *) xc->my_member_info->rbuf + offset; + } + + // Happens under certain circumstances with MPI_IN_PLACE or with CICO + if(src2 == dst) { + src2 = NULL; + } else if(src == dst) { + src = src2; + src2 = NULL; + } + + xhc_atomic_rmb(); + + if(src2) { + ompi_3buff_op_reduce(op, src2, src, dst, elements, dtype); + } else { + ompi_op_reduce(op, src, dst, elements, dtype); + } + + /* If we reached the end of the area after this reduction, switch + * to the next one, or mark completion if it was the last one. + * Otherwise, adjust the count according to the area's parameters. */ + if(member_item->count + elements == area->work_end) { + if(member_item->area_id < xc->n_reduce_areas - 1) { + member_item->area_id++; + member_item->count = xc->reduce_area[member_item->area_id].work_begin; + } else { + member_item->count = allreduce_count; + } + } else { + member_item->count += area->stride; + } +} + +static void xhc_allreduce_reduce_return_item(xhc_comm_t *xc, + xhc_rq_item_t *member_item) { + + bool placed = false; + + xhc_rq_item_t *item; + OPAL_LIST_FOREACH_REV(item, xc->reduce_queue, xhc_rq_item_t) { + if(member_item->count >= item->count) { + opal_list_insert_pos(xc->reduce_queue, + (opal_list_item_t *) item->super.opal_list_next, + (opal_list_item_t *) member_item); + + placed = true; + break; + } + } + + if(!placed) { + opal_list_prepend(xc->reduce_queue, (opal_list_item_t *) member_item); + } + + xhc_rq_item_t *first_item = (xhc_rq_item_t *) + opal_list_get_first(xc->reduce_queue); + + if(first_item->count > xc->my_member_ctrl->reduce_done) { + xhc_atomic_wmb(); + + volatile xf_int_t *rdp = &xc->my_member_ctrl->reduce_done; + xhc_atomic_store_int(rdp, first_item->count); + } +} + +static void xhc_allreduce_do_bcast(xhc_comm_t *comms, int comm_count, + xhc_comm_t *src_comm, size_t bytes_total, size_t *bcast_done, + const void *bcast_src, void *bcast_dst, void *bcast_cico) { + + size_t copy_size = opal_min(src_comm->chunk_size, bytes_total - *bcast_done); + + volatile xf_size_t *brp = &src_comm->comm_ctrl->bytes_ready; + + if(xhc_atomic_load_size_t(brp) - *bcast_done >= copy_size) { + void *src = (char *) bcast_src + *bcast_done; + void *dst = (char *) bcast_dst + *bcast_done; + void *cico_dst = (char *) bcast_cico + *bcast_done; + + xhc_atomic_rmb(); + + if(bcast_cico && comms[0].is_coll_leader) { + memcpy(cico_dst, src, copy_size); + } else { + memcpy(dst, src, copy_size); + } + + *bcast_done += copy_size; + + xhc_atomic_wmb(); + + for(int i = 0; i < comm_count; i++) { + if(!comms[i].is_coll_leader) { + break; + } + + volatile xf_size_t *brp_d = &comms[i].comm_ctrl->bytes_ready; + xhc_atomic_store_size_t(brp_d, *bcast_done); + } + + if(bcast_cico && comms[0].is_coll_leader) { + memcpy(dst, cico_dst, copy_size); + } + } +} + +// ----------------------------- + +int mca_coll_xhc_allreduce_internal(const void *sbuf, void *rbuf, int count, + ompi_datatype_t *datatype, ompi_op_t *op, ompi_communicator_t *ompi_comm, + mca_coll_base_module_t *ompi_module, bool require_bcast) { + + xhc_module_t *module = (xhc_module_t *) ompi_module; + + if(!module->init) { + int ret = xhc_lazy_init(module, ompi_comm); + if(ret != OMPI_SUCCESS) { + return 
ret; + } + } + + if(!ompi_datatype_is_predefined(datatype)) { + static bool warn_shown = false; + + if(!warn_shown) { + opal_output_verbose(MCA_BASE_VERBOSE_WARN, + ompi_coll_base_framework.framework_output, + "coll:xhc: Warning: XHC does not currently support " + "derived datatypes; utilizing fallback component"); + warn_shown = true; + } + + xhc_coll_fns_t fallback = module->prev_colls; + + if(require_bcast) { + return fallback.coll_allreduce(sbuf, rbuf, count, datatype, + op, ompi_comm, fallback.coll_allreduce_module); + } else { + return fallback.coll_reduce(sbuf, rbuf, count, datatype, + op, 0, ompi_comm, fallback.coll_reduce_module); + } + } + + if(!ompi_op_is_commute(op)) { + static bool warn_shown = false; + + if(!warn_shown) { + opal_output_verbose(MCA_BASE_VERBOSE_WARN, + ompi_coll_base_framework.framework_output, + "coll:xhc: Warning: (all)reduce does not support non-commutative " + "operators; utilizing fallback component"); + warn_shown = true; + } + + xhc_coll_fns_t fallback = module->prev_colls; + + if(require_bcast) { + return fallback.coll_allreduce(sbuf, rbuf, count, datatype, + op, ompi_comm, fallback.coll_allreduce_module); + } else { + return fallback.coll_reduce(sbuf, rbuf, count, datatype, + op, 0, ompi_comm, fallback.coll_reduce_module); + } + } + + // ---- + + xhc_peer_info_t *peer_info = module->peer_info; + xhc_data_t *data = module->data; + + xhc_comm_t *comms = data->comms; + int comm_count = data->comm_count; + + size_t dtype_size, bytes_total; + ompi_datatype_type_size(datatype, &dtype_size); + bytes_total = count * dtype_size; + + bool do_cico = (bytes_total <= OMPI_XHC_CICO_MAX); + bool out_of_order_reduce = false; + + int rank = ompi_comm_rank(ompi_comm); + + // ---- + + switch(mca_coll_xhc_component.dynamic_reduce) { + case OMPI_XHC_DYNAMIC_REDUCE_DISABLED: + out_of_order_reduce = false; + break; + + case OMPI_XHC_DYNAMIC_REDUCE_NON_FLOAT: + out_of_order_reduce = !(datatype->super.flags & OMPI_DATATYPE_FLAG_DATA_FLOAT); + break; + + case OMPI_XHC_DYNAMIC_REDUCE_ALL: + out_of_order_reduce = true; + break; + } + + // ---- + + // rbuf won't be present for non-root ranks in MPI_Reduce + if(rbuf == NULL && !do_cico) { + if(module->rbuf_size < bytes_total) { + void *tmp = realloc(module->rbuf, bytes_total); + + if(tmp != NULL) { + module->rbuf = tmp; + module->rbuf_size = bytes_total; + } else { + return OPAL_ERR_OUT_OF_RESOURCE; + } + } + + rbuf = module->rbuf; + } + + // ---- + + xf_sig_t pvt_seq = ++data->pvt_coll_seq; + + if(sbuf == MPI_IN_PLACE) { + sbuf = rbuf; + } + + xhc_allreduce_init_local(comms, comm_count, count, dtype_size, pvt_seq); + xhc_allreduce_init_comm(comms, comm_count, rbuf, do_cico, rank, pvt_seq); + xhc_allreduce_init_member(comms, comm_count, peer_info, + (void *) sbuf, rbuf, count, do_cico, rank, pvt_seq); + + void *local_cico = xhc_get_cico(peer_info, comms[0].manager_rank); + + // My conscience is clear! + if(require_bcast) { + goto _allreduce; + } else { + goto _reduce; + } + +// ============================================================================= + +_allreduce: { + + xhc_comm_t *bcast_comm = + xhc_allreduce_bcast_src_comm(comms, comm_count); + + bool bcast_leader_joined = false; + + for(size_t bytes_done = 0; bytes_done < bytes_total; ) { + for(int i = 0; i < comm_count; i++) { + xhc_comm_t *xc = &comms[i]; + xhc_comm_t *xnc = (i < comm_count - 1 ? 
&comms[i+1] : NULL); + + if(do_cico && i == 0 && xc->my_member_ctrl->reduce_ready < count) { + xhc_allreduce_cico_publish(xc, (void *) sbuf, + peer_info, rank, count, dtype_size); + } + + if(xc->is_coll_leader) { + int completed = 0; + + if(!xc->all_joined) { + xhc_allreduce_leader_check_all_joined(xc, pvt_seq); + } + + if(xc->all_joined) { + completed = count; + + for(int m = 0; m < xc->size; m++) { + volatile xf_int_t *rdp = &xc->member_ctrl[m].reduce_done; + int member_done = xhc_atomic_load_int(rdp); + + /* Watch out for double evaluation here, don't perform + * sensitive loads inside opal_min()'s parameter list. */ + completed = opal_min(completed, member_done); + } + } + + if(xnc && completed > xnc->my_member_ctrl->reduce_ready) { + volatile xf_int_t *rrp = &xnc->my_member_ctrl->reduce_ready; + xhc_atomic_store_int(rrp, completed); + } else if(!xnc) { + size_t bytes_fully_reduced = completed * dtype_size; + + // Broadcast fully reduced data + if(bytes_fully_reduced > bytes_done) { + for(int k = 0; k < comm_count; k++) { + volatile xf_size_t *brp = + &comms[k].comm_ctrl->bytes_ready; + xhc_atomic_store_size_t(brp, bytes_fully_reduced); + } + + if(do_cico) { + void *src = (char *) local_cico + bytes_done; + void *dst = (char *) rbuf + bytes_done; + memcpy(dst, src, bytes_fully_reduced - bytes_done); + } + + bytes_done = bytes_fully_reduced; + } + } + } + + // Is the reduction phase completed? + if(xc->my_member_ctrl->reduce_done < count) { + xhc_rq_item_t *member_item = NULL; + + int ret = xhc_allreduce_reduce_get_next(xc, + peer_info, count, dtype_size, do_cico, + out_of_order_reduce, pvt_seq, &member_item); + + if(ret != 0) { + return OMPI_ERROR; + } + + if(member_item) { + xhc_allreduce_do_reduce(xc, member_item, + count, datatype, dtype_size, op); + + xhc_allreduce_reduce_return_item(xc, member_item); + } + } + + /* If not a leader in this comm, not + * participating in higher-up ones. */ + if(!xc->is_coll_leader) { + break; + } + } + + if(bcast_comm && !bcast_leader_joined) { + if(CHECK_FLAG(&bcast_comm->comm_ctrl->coll_seq, pvt_seq, 0)) { + xhc_atomic_rmb(); + + int leader = bcast_comm->comm_ctrl->leader_id; + + if(!bcast_comm->member_info[leader].init) { + WAIT_FLAG(&bcast_comm->member_ctrl[leader].member_seq, + pvt_seq, 0); + + xhc_atomic_rmb(); + + xhc_allreduce_attach_member(bcast_comm, leader, + peer_info, bytes_total, do_cico, pvt_seq); + } + + bcast_leader_joined = true; + } + } + + if(bcast_comm && bcast_leader_joined) { + int leader = bcast_comm->comm_ctrl->leader_id; + + xhc_allreduce_do_bcast(comms, comm_count, + bcast_comm, bytes_total, &bytes_done, + bcast_comm->member_info[leader].rbuf, + rbuf, (do_cico ? local_cico : NULL)); + } + } + + xhc_allreduce_do_ack(comms, comm_count, pvt_seq); + + goto _finish; +} + +// ============================================================================= + +_reduce: { + + size_t cico_copied = 0; + int completed_comms = 0; + + while(completed_comms < comm_count) { + for(int i = completed_comms; i < comm_count; i++) { + xhc_comm_t *xc = &comms[i]; + xhc_comm_t *xnc = (i < comm_count - 1 ? 
&comms[i+1] : NULL); + + if(do_cico && i == 0 && xc->my_member_ctrl->reduce_ready < count) { + xhc_allreduce_cico_publish(xc, (void *) sbuf, + peer_info, rank, count, dtype_size); + } + + if(xc->is_coll_leader) { + int completed = 0; + + if(!xc->all_joined) { + xhc_allreduce_leader_check_all_joined(xc, pvt_seq); + } + + if(xc->all_joined) { + completed = count; + + for(int m = 0; m < xc->size; m++) { + volatile xf_int_t *rdp = &xc->member_ctrl[m].reduce_done; + int member_done = xhc_atomic_load_int(rdp); + + /* Watch out for double evaluation here, don't perform + * sensitive loads inside opal_min()'s parameter list. */ + completed = opal_min(completed, member_done); + } + } + + if(xnc && completed > xnc->my_member_ctrl->reduce_ready) { + volatile xf_int_t *rrp = &xnc->my_member_ctrl->reduce_ready; + xhc_atomic_store_int(rrp, completed); + } else if(!xnc) { + size_t completed_bytes = completed * dtype_size; + + if(do_cico && completed_bytes > cico_copied) { + void *src = (char *) local_cico + cico_copied; + void *dst = (char *) rbuf + cico_copied; + + memcpy(dst, src, completed_bytes - cico_copied); + cico_copied = completed_bytes; + } + } + + if(completed >= count) { + xc->comm_ctrl->coll_ack = pvt_seq; + completed_comms++; + } + } + + // Is the reduction phase completed? + if(xc->my_member_ctrl->reduce_done < count) { + xhc_rq_item_t *member_item = NULL; + + int ret = xhc_allreduce_reduce_get_next(xc, + peer_info, count, dtype_size, do_cico, + out_of_order_reduce, pvt_seq, &member_item); + + if(ret != 0) { + return OMPI_ERROR; + } + + if(member_item) { + xhc_allreduce_do_reduce(xc, member_item, + count, datatype, dtype_size, op); + + xhc_allreduce_reduce_return_item(xc, member_item); + } + } + + if(!xc->is_coll_leader) { + /* If all reduction-related tasks are done, and + * not a leader on the next comm, can exit */ + if(xc->my_member_ctrl->reduce_done >= count + && xc->my_member_ctrl->reduce_ready >= count) { + goto _reduce_done; + } + + /* Not a leader in this comm, so not + * participating in higher-up ones. */ + break; + } + } + } + + _reduce_done: + + for(int i = 0; i < comm_count; i++) { + xhc_comm_t *xc = &comms[i]; + + /* Wait for the leader to give the signal that reduction + * has finished on this comm and members are free to exit */ + if(!xc->is_coll_leader) { + WAIT_FLAG(&xc->comm_ctrl->coll_ack, pvt_seq, OMPI_XHC_ACK_WIN); + } + + // load-store control dependency with coll_ack; no need for barrier + xc->my_member_ctrl->member_ack = pvt_seq; + + if(!xc->is_coll_leader) { + break; + } + } + + goto _finish; +} + +// ============================================================================= + +_finish: + + if(!do_cico) { + xhc_allreduce_disconnect_peers(comms, comm_count); + } + + return OMPI_SUCCESS; +} + +int mca_coll_xhc_allreduce(const void *sbuf, void *rbuf, + int count, ompi_datatype_t *datatype, ompi_op_t *op, + ompi_communicator_t *ompi_comm, mca_coll_base_module_t *ompi_module) { + + return xhc_allreduce_internal(sbuf, rbuf, + count, datatype, op, ompi_comm, ompi_module, true); +} diff --git a/ompi/mca/coll/xhc/coll_xhc_atomic.h b/ompi/mca/coll/xhc/coll_xhc_atomic.h new file mode 100644 index 00000000000..79f1dce98cb --- /dev/null +++ b/ompi/mca/coll/xhc/coll_xhc_atomic.h @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2021-2023 Computer Architecture and VLSI Systems (CARV) + * Laboratory, ICS Forth. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_COLL_XHC_ATOMIC_EXPORT_H +#define MCA_COLL_XHC_ATOMIC_EXPORT_H + +#include <stdint.h> +#include "opal/sys/atomic.h" + +// ---------------------------------------- + +#define IS_SIG_ATOMIC_X_BITS(x) \ + (SIG_ATOMIC_MAX == INT ## x ## _MAX) || (SIG_ATOMIC_MAX == UINT ## x ## _MAX) + +// ---------------------------------------- + +// If xf_sig_t is ever re-defined to be signed, + // CHECK_FLAG()'s comparisons must be adjusted +#if IS_SIG_ATOMIC_X_BITS(64) + typedef uint64_t xf_sig_t; +#elif IS_SIG_ATOMIC_X_BITS(32) + typedef uint32_t xf_sig_t; +#elif IS_SIG_ATOMIC_X_BITS(16) + typedef uint16_t xf_sig_t; +#elif IS_SIG_ATOMIC_X_BITS(8) + typedef uint8_t xf_sig_t; +#endif + +typedef int __attribute__((aligned(SIZEOF_INT))) xf_int_t; +typedef size_t __attribute__((aligned(SIZEOF_SIZE_T))) xf_size_t; + +// ---------------------------------------- + +#define xhc_atomic_rmb opal_atomic_rmb +#define xhc_atomic_wmb opal_atomic_wmb +#define xhc_atomic_fmb opal_atomic_mb + +// https://github.com/open-mpi/ompi/issues/9722 + +#if OPAL_USE_GCC_BUILTIN_ATOMICS || OPAL_USE_C11_ATOMICS + #define xhc_atomic_load_int(addr) __atomic_load_n(addr, __ATOMIC_RELAXED) + #define xhc_atomic_store_int(addr, val) __atomic_store_n(addr, val, __ATOMIC_RELAXED) + + #define xhc_atomic_load_size_t(addr) __atomic_load_n(addr, __ATOMIC_RELAXED) + #define xhc_atomic_store_size_t(addr, val) __atomic_store_n(addr, val, __ATOMIC_RELAXED) +#else + #define xhc_atomic_load_int(addr) (*(addr)) + #define xhc_atomic_store_int(addr, val) (*(addr) = (val)) + + #define xhc_atomic_load_size_t(addr) (*(addr)) + #define xhc_atomic_store_size_t(addr, val) (*(addr) = (val)) + + #warning "GCC or the C11 atomics backend was not found.
XHC might not function correctly" +/* #else + #error "XHC atomics do not yet work without the GCC or the C11 backend" */ +#endif + + +// If/when opal atomic load/store size_t is added + +/* #define xhc_atomic_load_size_t(addr) \ + opal_atomic_load_size_t ((opal_atomic_size_t *) addr) +#define xhc_atomic_store_size_t(addr, val) \ + opal_atomic_store_size_t ((opal_atomic_size_t *) addr, val) */ + + +// If/when opal atomic load/store is added, and if opal atomic load/store int is not + +/* #if SIZEOF_INT == 4 + #define xhc_atomic_load_int(addr) opal_atomic_load_32 ((opal_atomic_int32_t *) addr) + #define xhc_atomic_store_int(addr, val) opal_atomic_store_32 ((opal_atomic_int32_t *) addr, val) +#elif SIZEOF_INT == 8 + #define xhc_atomic_load_int(addr) opal_atomic_load_64 ((opal_atomic_int64_t *) addr) + #define xhc_atomic_store_int(addr, val) opal_atomic_store_64 ((opal_atomic_int64_t *) addr, val) +#else + #error "Unsupported int size" +#endif */ + + +// If/when opal atomic load/store is added, and if opal atomic load/store size_t is not + +/* #if SIZEOF_SIZE_T == 4 + #define xhc_atomic_load_size_t(addr) opal_atomic_load_32 ((opal_atomic_int32_t *) addr) + #define xhc_atomic_store_size_t(addr, val) opal_atomic_store_32 ((opal_atomic_int32_t *) addr, val) +#elif SIZEOF_SIZE_T == 8 + #define xhc_atomic_load_size_t(addr) opal_atomic_load_64 ((opal_atomic_int64_t *) addr) + #define xhc_atomic_store_size_t(addr, val) opal_atomic_store_64 ((opal_atomic_int64_t *) addr, val) +#else + #error "Unsupported size_t size" +#endif */ + +static inline bool xhc_atomic_cmpxchg_strong_relaxed(volatile xf_sig_t *addr, + xf_sig_t *oldval, xf_sig_t newval) { + + #if OPAL_USE_GCC_BUILTIN_ATOMICS || OPAL_USE_C11_ATOMICS + return __atomic_compare_exchange_n(addr, oldval, newval, + false, __ATOMIC_RELAXED, __ATOMIC_RELAXED); + #else + #if IS_SIG_ATOMIC_X_BITS(32) + return opal_atomic_compare_exchange_strong_32(addr, oldval, newval); + #elif IS_SIG_ATOMIC_X_BITS(64) + return opal_atomic_compare_exchange_strong_64(addr, oldval, newval); + #else + #error "Unsupported sig_atomic_t size" + #endif + #endif +} + +#endif diff --git a/ompi/mca/coll/xhc/coll_xhc_barrier.c b/ompi/mca/coll/xhc/coll_xhc_barrier.c new file mode 100644 index 00000000000..ade1300134a --- /dev/null +++ b/ompi/mca/coll/xhc/coll_xhc_barrier.c @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2021-2023 Computer Architecture and VLSI Systems (CARV) + * Laboratory, ICS Forth. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" +#include "mpi.h" + +#include "ompi/constants.h" +#include "ompi/communicator/communicator.h" + +#include "coll_xhc.h" + +static void xhc_barrier_leader(xhc_comm_t *comms, int comm_count, + xhc_peer_info_t *peer_info, int rank, int root, xf_sig_t seq) { + + // Non-leader by default + for(int i = 0; i < comm_count; i++) { + comms[i].is_coll_leader = false; + } + + for(int i = 0; i < comm_count; i++) { + // I'm the root and therefore always a leader + if(rank == root) { + comms[i].comm_ctrl->leader_seq = seq; + comms[i].is_coll_leader = true; + + continue; + } + + // The root takes leadership precedence when local + if(PEER_IS_LOCAL(peer_info, root, comms[i].locality)) { + break; + } + + // The member with the lowest ID (ie. 
the manager) becomes the leader + if(comms[i].member_id == 0) { + comms[i].comm_ctrl->leader_seq = seq; + comms[i].is_coll_leader = true; + } + + // Non-leaders exit; they can't become leaders on higher levels + if(comms[i].is_coll_leader == false) { + break; + } + } +} + +/* Hierarchical Barrier with seq/ack flags + * --------------------------------------- + * 1. Ranks write their coll_seq field to signal they have joined + * the collective. Leaders propagate this information towards + * the top-most comm's leader using the same method. + * + * 2. The top-most comm's leader (root) sets the comm's coll_ack + * field to signal, that all ranks have joined the barrier. + * + * 3. Leaders propagate the info towards the bottom-most comm, using + * the same method. Ranks wait on thei coll_ack flag, set their + * own ack, and exit the collective. + * --------------------------------------- */ +int mca_coll_xhc_barrier(ompi_communicator_t *ompi_comm, + mca_coll_base_module_t *ompi_module) { + + xhc_module_t *module = (xhc_module_t *) ompi_module; + + if(!module->init) { + int ret = xhc_lazy_init(module, ompi_comm); + if(ret != OMPI_SUCCESS) return ret; + } + + xhc_peer_info_t *peer_info = module->peer_info; + xhc_data_t *data = module->data; + + xhc_comm_t *comms = data->comms; + int comm_count = data->comm_count; + + int rank = ompi_comm_rank(ompi_comm); + + xf_sig_t pvt_seq = ++data->pvt_coll_seq; + + xhc_barrier_leader(comms, comm_count, peer_info, rank, + mca_coll_xhc_component.barrier_root, pvt_seq); + + // 1. Upwards SEQ Wave + for(int i = 0; i < comm_count; i++) { + xhc_comm_t *xc = &comms[i]; + + xc->my_member_ctrl->member_seq = pvt_seq; + + if(!xc->is_coll_leader) { + break; + } + + for(int m = 0; m < xc->size; m++) { + if(m == xc->member_id) { + continue; + } + + /* Poll comm members and wait for them to join the barrier. + * No need for windowed comparison here; Ranks won't exit the + * barrier before the leader has set the coll_seq flag. */ + WAIT_FLAG(&xc->member_ctrl[m].member_seq, pvt_seq, 0); + } + } + + // 2. Wait for ACK (root won't wait!) + for(int i = 0; i < comm_count; i++) { + xhc_comm_t *xc = &comms[i]; + + if(xc->is_coll_leader == false) { + WAIT_FLAG(&xc->comm_ctrl->coll_ack, pvt_seq, 0); + break; + } + } + + // 3. Trigger ACK Wave + for(int i = 0; i < comm_count; i++) { + xhc_comm_t *xc = &comms[i]; + + /* Not actually necessary for the barrier operation, but + * good for consistency between all seq/ack numbers */ + xc->my_member_ctrl->member_ack = pvt_seq; + + if(!xc->is_coll_leader) { + break; + } + + xc->comm_ctrl->coll_ack = pvt_seq; + } + + return OMPI_SUCCESS; +} diff --git a/ompi/mca/coll/xhc/coll_xhc_bcast.c b/ompi/mca/coll/xhc/coll_xhc_bcast.c new file mode 100644 index 00000000000..f0b99983e50 --- /dev/null +++ b/ompi/mca/coll/xhc/coll_xhc_bcast.c @@ -0,0 +1,341 @@ +/* + * Copyright (c) 2021-2023 Computer Architecture and VLSI Systems (CARV) + * Laboratory, ICS Forth. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" +#include "mpi.h" + +#include "ompi/constants.h" +#include "ompi/datatype/ompi_datatype.h" +#include "ompi/communicator/communicator.h" +#include "opal/util/show_help.h" +#include "opal/util/minmax.h" + +#include "coll_xhc.h" + +/* When dynamic leadership is enabled, the first rank of each + * xhc comm to join the collective will become its leader */ +static void xhc_bcast_try_leader(xhc_comm_t *comms, int comm_count, + xhc_peer_info_t *peer_info, int rank, int root, xf_sig_t seq) { + + // Non-leader by default + for(int i = 0; i < comm_count; i++) { + comms[i].is_coll_leader = false; + } + + for(int i = 0; i < comm_count; i++) { + // I'm the root and therefore always a leader + if(rank == root) { + comms[i].comm_ctrl->leader_seq = seq; + comms[i].is_coll_leader = true; + + continue; + } + + // The root takes leadership precedence when local + if(PEER_IS_LOCAL(peer_info, root, comms[i].locality)) { + break; + } + + if(mca_coll_xhc_component.dynamic_leader == false) { + /* If dynamic leadership is disabled, the member with + * the lowest ID (ie. the manager) becomes the leader */ + if(comms[i].member_id == 0) { + comms[i].comm_ctrl->leader_seq = seq; + comms[i].is_coll_leader = true; + } + } else { + // An opportunity exists to become the leader + if(comms[i].comm_ctrl->leader_seq != seq) { + xf_sig_t oldval = seq - 1; + + comms[i].is_coll_leader = xhc_atomic_cmpxchg_strong_relaxed( + &comms[i].comm_ctrl->leader_seq, &oldval, seq); + } + } + + // Non-leaders exit; they can't become leaders on higher levels + if(comms[i].is_coll_leader == false) { + break; + } + } + + /* The writes and the cmpxchg to comm_ctrl->leader_seq, are relaxed. + * They do not synchronize access to any other data, and it's not a + * problem if some closeby loads/stores are reordered with it. The + * only purpose of leader_seq is to determine if a rank will be leader + * or not. Only the result of the cmp operation is utilized. */ +} + +static void xhc_bcast_children_init(xhc_comm_t *comms, int comm_count, + void *buffer, size_t bytes_ready, xhc_copy_data_t *region_data, + bool do_cico, int rank, xf_sig_t seq) { + + for(int i = comm_count - 1; i >= 0; i--) { + xhc_comm_t *xc = &comms[i]; + + if(!xc->is_coll_leader) { + continue; + } + + WAIT_FLAG(&xc->comm_ctrl->coll_ack, seq - 1, 0); + + /* Because there is a control dependency with the loads + * from coll_ack above and the code below, and because it + * is a load-store one (not load-load), I declare that a + * read-memory-barrier is not required here. */ + + xc->comm_ctrl->leader_id = xc->member_id; + xc->comm_ctrl->leader_rank = rank; + + xc->comm_ctrl->cico_id = (do_cico ? comms[0].manager_rank : -1); + + xc->comm_ctrl->data_vaddr = (!do_cico ? buffer : NULL); + xc->comm_ctrl->bytes_ready = bytes_ready; + + if(region_data != NULL) { + xhc_copy_region_post(xc->comm_ctrl->access_token, region_data); + } + + /* The above comm_ctrl stores must have finished before the + * peers are notified to attach/copy. We don't need an atomic + * store to bytes_ready here, since it is guarded by coll_seq. 
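On the receiving side this is mirrored: peers WAIT_FLAG() on + * coll_seq and issue an rmb() before loading the fields published + * above, so the wmb()/coll_seq store effectively act as a release + * and the peers' rmb() as the matching acquire.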
*/ + xhc_atomic_wmb(); + + xc->comm_ctrl->coll_seq = seq; + } +} + +static void xhc_bcast_children_set_bytes_ready(xhc_comm_t *comms, + int comm_count, size_t bytes) { + + for(int i = comm_count - 1; i >= 0; i--) { + xhc_comm_t *xc = &comms[i]; + + if(!xc->is_coll_leader) { + continue; + } + + volatile xf_size_t *brp = &xc->comm_ctrl->bytes_ready; + xhc_atomic_store_size_t(brp, bytes); + } + + /* Not much reason for a wmb() here or inside the loop. + * The stores may be reordered after any following stores, + * and within themselves. */ +} + +static void xhc_bcast_do_ack(xhc_comm_t *comms, + int comm_count, xf_sig_t seq) { + + // Set Ack(s) + for(int i = 0; i < comm_count; i++) { + xhc_comm_t *xc = &comms[i]; + + xc->my_member_ctrl->member_ack = seq; + + if(!xc->is_coll_leader) { + break; + } + } + + // Gather members' Ack(s) and set coll_ack + for(int i = 0; i < comm_count; i++) { + xhc_comm_t *xc = &comms[i]; + + if(!xc->is_coll_leader) { + break; + } + + for(int m = 0; m < xc->size; m++) { + if(m == xc->member_id) { + continue; + } + + WAIT_FLAG(&xc->member_ctrl[m].member_ack, seq, OMPI_XHC_ACK_WIN); + } + + xc->comm_ctrl->coll_ack = seq; + } +} + +static xhc_comm_t *xhc_bcast_src_comm(xhc_comm_t *comms, int comm_count) { + xhc_comm_t *s = NULL; + + for(int i = 0; i < comm_count; i++) { + if(!comms[i].is_coll_leader) { + s = &comms[i]; + break; + } + } + + return s; +} + +int mca_coll_xhc_bcast(void *buf, int count, ompi_datatype_t *datatype, int root, + ompi_communicator_t *ompi_comm, mca_coll_base_module_t *ompi_module) { + + xhc_module_t *module = (xhc_module_t *) ompi_module; + + if(!module->init) { + int ret = xhc_lazy_init(module, ompi_comm); + if(ret != OMPI_SUCCESS) return ret; + } + + if(!ompi_datatype_is_predefined(datatype)) { + static bool warn_shown = false; + + if(!warn_shown) { + opal_output_verbose(MCA_BASE_VERBOSE_WARN, + ompi_coll_base_framework.framework_output, + "coll:xhc: Warning: XHC does not currently support " + "derived datatypes; utilizing fallback component"); + warn_shown = true; + } + + xhc_coll_fns_t fallback = ((xhc_module_t *) module)->prev_colls; + return fallback.coll_bcast(buf, count, datatype, root, + ompi_comm, fallback.coll_bcast_module); + } + + // ---- + + xhc_peer_info_t *peer_info = module->peer_info; + xhc_data_t *data = module->data; + + xhc_comm_t *comms = data->comms; + int comm_count = data->comm_count; + + size_t dtype_size, bytes_total; + ompi_datatype_type_size(datatype, &dtype_size); + bytes_total = count * dtype_size; + + int rank = ompi_comm_rank(ompi_comm); + + bool do_cico = (bytes_total <= OMPI_XHC_CICO_MAX); + void *local_cico = xhc_get_cico(peer_info, comms[0].manager_rank); + void *src_buffer; + + // Only really necessary for smsc/knem + xhc_copy_data_t *region_data = NULL; + + // ---- + + xf_sig_t pvt_seq = ++data->pvt_coll_seq; + + xhc_bcast_try_leader(comms, comm_count, peer_info, rank, root, pvt_seq); + + // No chunking for now... TODO? + if(rank == root && do_cico) { + memcpy(local_cico, buf, bytes_total); + } + + if(!do_cico) { + int err = xhc_copy_expose_region(buf, bytes_total, ®ion_data); + if(err != 0) { + return OMPI_ERROR; + } + } + + xhc_bcast_children_init(comms, comm_count, buf, + (rank == root ? 
bytes_total : 0), region_data, do_cico, rank, pvt_seq); + + if(rank == root) { + goto coll_finish; + } + + // ---- + + /* Not actually necessary for the broadcast operation, but + * good for consistency between all seq/ack numbers */ + for(int i = 0; i < comm_count; i++) { + comms[i].my_member_ctrl->member_seq = pvt_seq; + if(!comms[i].is_coll_leader) { + break; + } + } + + xhc_comm_t *src_comm = xhc_bcast_src_comm(comms, comm_count); + xhc_comm_ctrl_t *src_ctrl = src_comm->comm_ctrl; + + WAIT_FLAG(&src_ctrl->coll_seq, pvt_seq, 0); + xhc_atomic_rmb(); + + if(!do_cico) { + src_buffer = src_ctrl->data_vaddr; + } else { + src_buffer = xhc_get_cico(peer_info, src_ctrl->cico_id); + if(src_buffer == NULL) return OMPI_ERR_OUT_OF_RESOURCE; + } + + size_t bytes_done = 0; + size_t bytes_available = 0; + + while(bytes_done < bytes_total) { + size_t copy_size = opal_min(src_comm->chunk_size, bytes_total - bytes_done); + + void *data_dst = (char *) buf + bytes_done; + void *data_src = (char *) src_buffer + bytes_done; + void *data_cico_dst = (char *) local_cico + bytes_done; + + if(bytes_available < copy_size) { + do { + volatile xf_size_t *brp = &src_ctrl->bytes_ready; + bytes_available = xhc_atomic_load_size_t(brp) - bytes_done; + } while(bytes_available < copy_size); + + // Wait on loads inside the loop + xhc_atomic_rmb(); + } + + /* Pipelining is not necessary on the bottom + * level, copy all available at once */ + if(!comms[0].is_coll_leader) { + copy_size = bytes_available; + } + + if(!do_cico) { + int err = xhc_copy_from(&peer_info[src_ctrl->leader_rank], + data_dst, data_src, copy_size, src_ctrl->access_token); + if(err != 0) { + return OMPI_ERROR; + } + } else { + memcpy((comms[0].is_coll_leader + ? data_cico_dst : data_dst), data_src, copy_size); + } + + bytes_done += copy_size; + bytes_available -= copy_size; + + /* Do make sure the memcpy has completed before + * writing to the peers' bytes_ready. */ + xhc_atomic_wmb(); + + xhc_bcast_children_set_bytes_ready(comms, comm_count, bytes_done); + + if(do_cico && comms[0].is_coll_leader) { + memcpy(data_dst, data_cico_dst, copy_size); + } + } + + if(!do_cico) { + xhc_copy_close_region(region_data); + } + + coll_finish: + + /* No wmb() necessary before sending ACK, as all operations + * that should be waited on (reads from shared buffers) have + * explicit barriers following them. */ + + xhc_bcast_do_ack(comms, comm_count, pvt_seq); + + return OMPI_SUCCESS; +} diff --git a/ompi/mca/coll/xhc/coll_xhc_component.c b/ompi/mca/coll/xhc/coll_xhc_component.c new file mode 100644 index 00000000000..dac4fd3db2d --- /dev/null +++ b/ompi/mca/coll/xhc/coll_xhc_component.c @@ -0,0 +1,677 @@ +/* + * Copyright (c) 2021-2023 Computer Architecture and VLSI Systems (CARV) + * Laboratory, ICS Forth. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" +#include "mpi.h" + +#include "ompi/mca/coll/coll.h" +#include "ompi/mca/coll/base/base.h" + +#include "opal/mca/shmem/base/base.h" +#include "opal/util/show_help.h" + +#include "coll_xhc.h" + +typedef int (*csv_parse_conv_fn_t)(char *str, void *dst); +typedef void (*csv_parse_destruct_fn_t)(void *data); + +static int xhc_register(void); + +const char *mca_coll_xhc_component_version_string = + "Open MPI xhc collective MCA component version " OMPI_VERSION; + +static const char *hwloc_topo_str[] = { + "node", "flat", + "socket", + "numa", + "l3", "l3cache", + "l2", "l2cache", + "l1", "l1cache", + "core", + "hwthread", "thread" +}; + +static const xhc_loc_t hwloc_topo_val[] = { + OPAL_PROC_ON_NODE, OPAL_PROC_ON_NODE, + OPAL_PROC_ON_SOCKET, + OPAL_PROC_ON_NUMA, + OPAL_PROC_ON_L3CACHE, OPAL_PROC_ON_L3CACHE, + OPAL_PROC_ON_L2CACHE, OPAL_PROC_ON_L2CACHE, + OPAL_PROC_ON_L1CACHE, OPAL_PROC_ON_L1CACHE, + OPAL_PROC_ON_CORE, + OPAL_PROC_ON_HWTHREAD, OPAL_PROC_ON_HWTHREAD +}; + +mca_coll_xhc_component_t mca_coll_xhc_component = { + .super = { + .collm_version = { + MCA_COLL_BASE_VERSION_2_4_0, + + .mca_component_name = "xhc", + MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, + OMPI_MINOR_VERSION, OMPI_RELEASE_VERSION), + + .mca_register_component_params = xhc_register, + }, + + .collm_data = { + MCA_BASE_METADATA_PARAM_CHECKPOINT + }, + + .collm_init_query = mca_coll_xhc_component_init_query, + .collm_comm_query = mca_coll_xhc_module_comm_query, + }, + + .priority = 0, + .print_info = false, + + .shmem_backing = NULL, + + .dynamic_leader = false, + + .barrier_root = 0, + + .dynamic_reduce = OMPI_XHC_DYNAMIC_REDUCE_NON_FLOAT, + .lb_reduce_leader_assist = + (OMPI_XHC_LB_RLA_TOP_LEVEL | OMPI_XHC_LB_RLA_FIRST_CHUNK), + + .force_reduce = false, + + .cico_max = 1024, + + .uniform_chunks = true, + .uniform_chunks_min = 1024, + + /* These are the parameters that will need + * processing, and their default values. */ + .hierarchy_mca = "numa,socket", + .chunk_size_mca = "16K" +}; + +/* Initial query function that is invoked during MPI_INIT, allowing + * this component to disqualify itself if it doesn't support the + * required level of thread support. 
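XHC itself imposes no restriction at this stage and always reports + * success; whether the component is actually used for a given + * communicator is decided later, in mca_coll_xhc_module_comm_query().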
*/ +int mca_coll_xhc_component_init_query(bool enable_progress_threads, + bool enable_mpi_threads) { + + return OMPI_SUCCESS; +} + +static mca_base_var_enum_value_t dynamic_reduce_options[] = { + {OMPI_XHC_DYNAMIC_REDUCE_DISABLED, "disabled"}, + {OMPI_XHC_DYNAMIC_REDUCE_NON_FLOAT, "non-float"}, + {OMPI_XHC_DYNAMIC_REDUCE_ALL, "all"}, + {0, NULL} +}; + +static mca_base_var_enum_value_flag_t lb_reduce_leader_assist_options[] = { + {OMPI_XHC_LB_RLA_TOP_LEVEL, "top", OMPI_XHC_LB_RLA_ALL}, + {OMPI_XHC_LB_RLA_FIRST_CHUNK, "first", OMPI_XHC_LB_RLA_ALL}, + {OMPI_XHC_LB_RLA_ALL, "all", + (OMPI_XHC_LB_RLA_TOP_LEVEL | OMPI_XHC_LB_RLA_FIRST_CHUNK)}, + {0, NULL, 0} +}; + +static int xhc_register(void) { + mca_base_var_enum_t *var_enum; + mca_base_var_enum_flag_t *var_enum_flag; + char *tmp, *desc; + int ret; + + /* Priority */ + + (void) mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, + "priority", "Priority of the xhc component", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_2, + MCA_BASE_VAR_SCOPE_READONLY, &mca_coll_xhc_component.priority); + + /* Info */ + + (void) mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, + "print_info", "Print information during initialization", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_3, + MCA_BASE_VAR_SCOPE_READONLY, &mca_coll_xhc_component.print_info); + + /* SHM Backing dir */ + + mca_coll_xhc_component.shmem_backing = (access("/dev/shm", W_OK) == 0 ? + "/dev/shm" : opal_process_info.job_session_dir); + + (void) mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, + "shmem_backing", "Directory to place backing files for shared-memory" + " control-data communication", MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, + OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_READONLY, + &mca_coll_xhc_component.shmem_backing); + + /* Dynamic leader */ + + (void) mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, + "dynamic_leader", "Enable dynamic operation-wise group-leader selection", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_READONLY, &mca_coll_xhc_component.dynamic_leader); + + /* Dynamic reduce */ + + ret = mca_base_var_enum_create("coll_xhc_dynamic_reduce_options", + dynamic_reduce_options, &var_enum); + if(ret != OPAL_SUCCESS) { + return ret; + } + + /* Barrier root */ + + (void) mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, + "barrier_root", "Internal root for the barrier operation (rank ID)", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_READONLY, &mca_coll_xhc_component.barrier_root); + + (void) mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, + "dynamic_reduce", "Dynamic/out-of-order intra-group reduction", + MCA_BASE_VAR_TYPE_INT, var_enum, 0, 0, OPAL_INFO_LVL_6, + MCA_BASE_VAR_SCOPE_READONLY, &mca_coll_xhc_component.dynamic_reduce); + + OBJ_RELEASE(var_enum); + + /* Load balancing: Reduce leader assistance */ + + ret = mca_base_var_enum_create_flag("coll_xhc_lb_reduce_leader_assist", + lb_reduce_leader_assist_options, &var_enum_flag); + if(ret != OPAL_SUCCESS) { + return ret; + } + + (void) mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, + "lb_reduce_leader_assist", "Reduction leader assistance modes for load balancing", + MCA_BASE_VAR_TYPE_INT, &var_enum_flag->super, 0, 0, OPAL_INFO_LVL_6, + MCA_BASE_VAR_SCOPE_READONLY, &mca_coll_xhc_component.lb_reduce_leader_assist); + + OBJ_RELEASE(var_enum_flag); + + /* Force enable 
"hacky" reduce */ + + (void) mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, + "force_reduce", "Force enable the \"special\" Reduce for all calls", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, &mca_coll_xhc_component.force_reduce); + + /* Hierarchy features */ + + desc = NULL; + + for(size_t i = 0; i < sizeof(hwloc_topo_str)/sizeof(char *); i++) { + ret = opal_asprintf(&tmp, "%s%s%s", (i > 0 ? desc : ""), + (i > 0 ? ", " : ""), hwloc_topo_str[i]); + free(desc); desc = tmp; + if(ret < 0) { + return OPAL_ERR_OUT_OF_RESOURCE; + } + } + + ret = opal_asprintf(&tmp, "Comma-separated list of topology features to " + "consider for the hierarchy (%s)", desc); + free(desc); desc = tmp; + if(ret < 0) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + (void) mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, + "hierarchy", desc, MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_4, + MCA_BASE_VAR_SCOPE_READONLY, &mca_coll_xhc_component.hierarchy_mca); + + free(desc); + + /* Chunk size(s) */ + + (void) mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, + "chunk_size", "The chunk size(s) to be used for the pipeline " + "(single value, or comma separated list for different hierarchy levels " + "(bottom to top))", + MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_READONLY, &mca_coll_xhc_component.chunk_size_mca); + + /* Allreduce uniform chunks */ + + (void) mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, + "uniform_chunks", "Automatically optimize chunk size in reduction " + "collectives according to message size, for load balancing", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_READONLY, &mca_coll_xhc_component.uniform_chunks); + + /* Allreduce uniform chunks min size */ + + (void) mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, + "uniform_chunks_min", "Minimum chunk size for reduction collectives, " + "when \"uniform chunks\" are enabled", MCA_BASE_VAR_TYPE_SIZE_T, + NULL, 0, 0, OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_READONLY, + &mca_coll_xhc_component.uniform_chunks_min); + + /* CICO threshold (inclusive) */ + + (void) mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, + "cico_max", "Maximum message size up to which to use CICO", + MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0, OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_READONLY, &mca_coll_xhc_component.cico_max); + + return OMPI_SUCCESS; +} + +static int parse_csv(const char *csv_orig, char sep, char ignore_start, + char ignore_end, void **vals_dst, int *len_dst, size_t type_size, + csv_parse_conv_fn_t conv_fn, csv_parse_destruct_fn_t destructor_fn, + char *err_help_header) { + + if(csv_orig == NULL || strlen(csv_orig) == 0) { + *vals_dst = NULL; + *len_dst = 0; + return OMPI_SUCCESS; + } + + char *csv = NULL; + void *vals = NULL; + + int vals_size = 0; + int ntokens = 0; + + int return_code = OMPI_SUCCESS; + + if(!(csv = strdup(csv_orig))) { + RETURN_WITH_ERROR(return_code, OMPI_ERR_OUT_OF_RESOURCE, end); + } + + if(!(vals = malloc((vals_size = 5) * type_size))) { + RETURN_WITH_ERROR(return_code, OMPI_ERR_OUT_OF_RESOURCE, end); + } + + int ignore_cnt = 0; + char *token = csv; + + int csv_len = strlen(csv); + + for(int i = 0; i < csv_len + 1; i++) { + char *c = csv+i; + + if(ntokens == vals_size) { + void *tmp = realloc(vals, (vals_size *= 2) * sizeof(type_size)); + if(!tmp) { + RETURN_WITH_ERROR(return_code, 
OMPI_ERR_OUT_OF_RESOURCE, end); + } + vals = tmp; + } + + if(ignore_start != 0) { + if(*c == ignore_start) { + ignore_cnt++; + } else if(*c == ignore_end) { + ignore_cnt--; + } + + if(ignore_cnt < 0) { + RETURN_WITH_ERROR(return_code, OMPI_ERR_BAD_PARAM, end); + } + } + + if(ignore_cnt == 0 && (*c == sep || *c == '\0')) { + char oldc = *c; + *c = '\0'; + + int status = conv_fn(token, (char *) vals + ntokens*type_size); + + if(status != OMPI_SUCCESS) { + if(err_help_header) { + opal_show_help("help-coll-xhc.txt", + err_help_header, true, token, csv_orig); + } + + RETURN_WITH_ERROR(return_code, status, end); + } + + ntokens++; + + *c = oldc; + token = c + 1; + } + } + + *vals_dst = vals; + *len_dst = ntokens; + + end: + + free(csv); + + if(return_code != OMPI_SUCCESS) { + if(vals && destructor_fn) { + for(int i = 0; i < ntokens; i++) { + destructor_fn((char *) vals + i*type_size); + } + } + + free(vals); + } + + return return_code; +} + +static int conv_xhc_loc_def_rank_list(char *str, void *result) { + char *strs[2] = {str, NULL}; + int nums[2] = {-1, -1}; + + char *range_op_pos = NULL; + + int return_code = OMPI_SUCCESS; + + if((range_op_pos = strstr(str, ".."))) { + strs[1] = range_op_pos + 2; + *range_op_pos = '\0'; + } + + for(int i = 0; i < 2 && strs[i]; i++) { + char *endptr; + + nums[i] = strtol(strs[i], &endptr, 10); + + if(endptr[0] != '\0' || nums[i] < 0) { + RETURN_WITH_ERROR(return_code, OMPI_ERR_BAD_PARAM, end); + } + } + + ((xhc_rank_range_t *) result)->start_rank = nums[0]; + ((xhc_rank_range_t *) result)->end_rank = (nums[1] != -1 ? nums[1] : nums[0]); + + end: + + if(range_op_pos) { + *range_op_pos = '.'; + } + + return return_code; +} + +static void mca_coll_xhc_loc_def_construct(xhc_loc_def_t *def) { + def->named_loc = 0; + def->rank_list = NULL; + def->rank_list_len = 0; + def->split = 0; + def->max_ranks = 0; + def->repeat = false; +} + +static void mca_coll_xhc_loc_def_destruct(xhc_loc_def_t *def) { + free(def->rank_list); +} + +OBJ_CLASS_INSTANCE(xhc_loc_def_t, opal_list_item_t, + mca_coll_xhc_loc_def_construct, mca_coll_xhc_loc_def_destruct); + +static int conv_xhc_loc_def(char *str, void *result) { + int return_code = OMPI_SUCCESS; + + char *s = strdup(str); + xhc_loc_def_t *def = OBJ_NEW(xhc_loc_def_t); + + if(!s || !def) { + RETURN_WITH_ERROR(return_code, OMPI_ERR_OUT_OF_RESOURCE, end); + } + + /* Parse modifiers and remove them from string */ + + if(s[strlen(s) - 1] == '*') { + def->repeat = true; + s[strlen(s) - 1] = '\0'; + } + + char *colon_pos = strrchr(s, ':'); + char *qmark_pos = strrchr(s, '?'); + + if(colon_pos && qmark_pos) { + RETURN_WITH_ERROR(return_code, OMPI_ERR_BAD_PARAM, end); + } else if(colon_pos || qmark_pos) { + char *numstr = (colon_pos ? 
colon_pos : qmark_pos); + char *endptr; + + int num = strtol(numstr + 1, &endptr, 10); + + if(endptr[0] != '\0' || num <= 0) { + RETURN_WITH_ERROR(return_code, OMPI_ERR_BAD_PARAM, end); + } + + if(colon_pos) { + def->split = num; + } else { + def->max_ranks = num; + } + + *numstr = '\0'; + } + + /* Parse locality definition */ + + if(s[0] == '[') { + if(def->repeat) { // repeat only makes sense with named localities + RETURN_WITH_ERROR(return_code, OMPI_ERR_BAD_PARAM, end); + } + + s[strlen(s) - 1] = '\0'; + + int status = parse_csv(s+1, ',', 0, 0, (void **) &def->rank_list, + &def->rank_list_len, sizeof(xhc_rank_range_t), + conv_xhc_loc_def_rank_list, NULL, NULL); + + if(status != OMPI_SUCCESS) { + RETURN_WITH_ERROR(return_code, status, end); + } + } else { + bool found = false; + + for(size_t i = 0; i < sizeof(hwloc_topo_str)/sizeof(char *); i++) { + if(strcasecmp(s, hwloc_topo_str[i]) == 0) { + def->named_loc = hwloc_topo_val[i]; + found = true; + break; + } + } + + if(!found) { + RETURN_WITH_ERROR(return_code, OMPI_ERR_BAD_PARAM, end); + } + } + + * (xhc_loc_def_t **) result = def; + + end: + + free(s); + + if(return_code != OMPI_SUCCESS) { + OBJ_RELEASE_IF_NOT_NULL(def); + } + + return return_code; +} + +static void destruct_xhc_loc_def(void *data) { + OBJ_RELEASE(* (xhc_loc_def_t **) data); +} + +static int conv_xhc_loc_def_combination(char *str, void *result) { + xhc_loc_def_t **defs; + int ndefs; + + int status = parse_csv(str, '+', 0, 0, (void **) &defs, + &ndefs, sizeof(xhc_loc_def_t *), conv_xhc_loc_def, + destruct_xhc_loc_def, NULL); + if(status != OMPI_SUCCESS) { + return status; + } + + opal_list_t *def_list = (opal_list_t *) result; + OBJ_CONSTRUCT(def_list, opal_list_t); + + for(int i = 0; i < ndefs; i++) { + opal_list_append(def_list, (opal_list_item_t *) defs[i]); + } + + free(defs); + + return OMPI_SUCCESS; +} + +static void destruct_xhc_loc_def_combination(void *data) { + OPAL_LIST_DESTRUCT((opal_list_t *) data); +} + +int mca_coll_xhc_component_parse_hierarchy(const char *val_str, + opal_list_t **level_defs_dst, int *nlevel_defs_dst) { + + /* The hierarchy is in a comma-separated list format. Each item in the + * list specifies how to group ranks, and each different item entails + * a grouping step. + * + * Each item in this list is a '+'-separated list. Of course, this can + * be just one item, without any delimiter, specifying the locality to + * follow for the grouping (e.g. numa, socket, etc). + * + * But, it can also be more complex (multiple '+'-separated items), used + * to describe virtual hierarchies. This allows to group different ranks + * in different ways, e.g. some ranks according to numa, then others by + * something else, etc. + * + * Each item in this '+'-separated list, can be of the following types: + * 1. A "named locality", e.g. hwloc's localities (only ones currently + * available), see hwloc_topo_str[]. + * 2. A list of ranks that should be grouped together. This is a comma- + * separated list of integers, enclosed in [] (I know, list-ception!). + * It may also contain range operators (..), to select multiple ranks + * at once (e.g. 0..3 expands to 0,1,2,3). Example: [0..15,20,22]. + * The order of the ranks does not matter. + * + * Finally, each such item may be suffixed by a special modifier: + * 1. The split modifier (:) specifies to group according to the + * locality it refers to, but to split each such group into multiple + * parts. E.g. 
the locality 'numa:2' will group ranks into half-NUMA + * groups, such that for each NUMA node, half the ranks are in one + * group, and the rest are in another. + * 2. The max-ranks modifier (?) works similarly to the split modifier, + * only that it specifies that at most _n_ ranks should be placed in + * each group. If more than _n_ ranks share the locality the modifier + * refers to, multiple groups will be created for these ranks, each one + * not more than _n_ ranks in size. + * 3. The repeat modifier (*), which can be specified along with the two + * previous modifiers, allows manual control over the repetition of + * named localities. See below, under 'repetition'. + * + * Repetition: + * Named localities are repeated for all distinct rank clusters. For + * example, "numa", even though it is a single key, means to group + * all ranks that are in the same NUMA together, which will lead to + * multiple groups if multiple NUMA nodes are present. This is in + * contrast to rank lists, which only create a single group, containing + * the ranks specified in it. The different items in the '+'-separated + * list are consumed in-order left-to-right, and any named localities + * are automatically repeated to apply to all ranks that are not included + * in other items. When multiple named localities are present one after + * the other, the last one is repeated, unless another repetition was + * explicitly requested via the repeat modifier. + * + * Examples: + * "numa": Group according to numa locality + * "numa,socket": Group according to numa and then socket locality + * "node"/"flat": Group according to node locality -> all ranks in + * same node -> flat hierarchy i.e. none at all + * + * "numa:2,socket": Group according to numa locality but with two + * groups per NUMA, and then according to socket. + * "numa:2,numa,socket": Similar to the previous one, but this case + * will result in one of the two half-NUMA leaders further becoming + * the leader of the NUMA node. + * "numa?10,socket": Group according to numa, but no more than 10 ranks + * per NUMA; create multiple groups if necessary. Then group according + * to socket. + * + * "[0..9]+[10..24]": Create 2 groups: one for the first 10 ranks, + * and another for the next 15 ones. + * "[0..39]+numa,socket": Group the first 40 ranks, and the rest + * according to numa locality. Then group according to socket. + * + * "socket+socket:2": Create at least two groups: one for all ranks + * in the first socket, while all the other ranks are grouped according + * to socket locality, but with two groups for each socket. + * "socket*+socket:2": Similar to the previous one, but only the last + * socket is split into two groups, while all the other ranks are grouped + * according to socket locality. + * + * If the top-most locality specified does not cover all ranks, one such + * locality will automatically be added (in the hierarchy sort method). + * + * (Oh god what have I done! 
-Frankenstein, probably) */ + + int status = parse_csv(val_str, ',', '[', ']', (void **) level_defs_dst, + nlevel_defs_dst, sizeof(opal_list_t), conv_xhc_loc_def_combination, + destruct_xhc_loc_def_combination, "bad-hierarchy-item"); + + return status; +} + +static int conv_chunk_size(char *str, void *result) { + size_t last_idx = strlen(str) - 1; + char saved_char = str[last_idx]; + + size_t mult = 1; + + switch(str[last_idx]) { + case 'g': case 'G': + mult *= 1024; + case 'm': case 'M': + mult *= 1024; + case 'k': case 'K': + mult *= 1024; + + str[last_idx] = '\0'; + } + + bool legal = (str[0] != '\0'); + + for(char *c = str; *c; c++) { + if((*c < '0' || *c > '9') && *c != '-') { + legal = false; + break; + } + } + + if(legal) { + long long num = atoll(str) * mult; + * (size_t *) result = (size_t) (num > 0 ? num : -1); + } + + str[last_idx] = saved_char; + + return (legal ? OMPI_SUCCESS : OMPI_ERR_BAD_PARAM); +} + +int mca_coll_xhc_component_parse_chunk_sizes(const char *val_str, + size_t **chunks_dst, int *len_dst) { + + if(val_str == NULL) { + *chunks_dst = malloc(sizeof(size_t)); + if(*chunks_dst == NULL) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + (*chunks_dst)[0] = (size_t) -1; + *len_dst = 1; + + return OMPI_SUCCESS; + } + + int status = parse_csv(val_str, ',', 0, 0, (void **) chunks_dst, len_dst, + sizeof(size_t), conv_chunk_size, NULL, "bad-chunk-size-item"); + + return status; +} diff --git a/ompi/mca/coll/xhc/coll_xhc_module.c b/ompi/mca/coll/xhc/coll_xhc_module.c new file mode 100644 index 00000000000..879e521f662 --- /dev/null +++ b/ompi/mca/coll/xhc/coll_xhc_module.c @@ -0,0 +1,721 @@ +/* + * Copyright (c) 2021-2023 Computer Architecture and VLSI Systems (CARV) + * Laboratory, ICS Forth. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include +#include + +#include "mpi.h" + +#include "ompi/mca/coll/coll.h" +#include "ompi/mca/coll/base/base.h" +#include "opal/mca/smsc/smsc.h" + +#include "opal/util/arch.h" +#include "opal/util/show_help.h" +#include "opal/util/minmax.h" + +#include "coll_xhc.h" + +static int xhc_module_save_fallback_fns( + xhc_module_t *module, ompi_communicator_t *comm); + +static int xhc_module_create_hierarchy(mca_coll_xhc_module_t *module, + ompi_communicator_t *comm, opal_list_t *level_defs, int nlevel_defs, + xhc_loc_t **hierarchy_dst, int *hierarchy_len_dst); + +static int xhc_module_sort_hierarchy(mca_coll_xhc_module_t *module, + ompi_communicator_t *comm, xhc_loc_t **hierarchy_dst, int *hierarchy_len_dst); + +// ----------------------------- + +static void xhc_module_clear(xhc_module_t *module) { + memset(&module->prev_colls, 0, sizeof(module->prev_colls)); + + module->comm_size = 0; + module->rank = -1; + + module->hierarchy_string = NULL; + module->hierarchy = NULL; + module->hierarchy_len = 0; + + module->chunks = NULL; + module->chunks_len = 0; + + module->rbuf = NULL; + module->rbuf_size = 0; + + module->peer_info = NULL; + module->data = NULL; + module->init = false; +} + +static void mca_coll_xhc_module_construct(mca_coll_xhc_module_t *module) { + xhc_module_clear(module); +} + +static void mca_coll_xhc_module_destruct(mca_coll_xhc_module_t *module) { + xhc_fini(module); + + free(module->hierarchy_string); + free(module->hierarchy); + free(module->chunks); + free(module->rbuf); + free(module->peer_info); + + xhc_module_clear(module); +} + +OBJ_CLASS_INSTANCE(mca_coll_xhc_module_t, mca_coll_base_module_t, + mca_coll_xhc_module_construct, 
mca_coll_xhc_module_destruct); + +// ----------------------------- + +mca_coll_base_module_t *mca_coll_xhc_module_comm_query(ompi_communicator_t *comm, + int *priority) { + + if((*priority = mca_coll_xhc_component.priority) < 0) { + return NULL; + } + + if(OMPI_COMM_IS_INTER(comm) || ompi_comm_size(comm) == 1 + || ompi_group_have_remote_peers (comm->c_local_group)) { + + opal_output_verbose(MCA_BASE_VERBOSE_COMPONENT, + ompi_coll_base_framework.framework_output, + "coll:xhc:comm_query (%s/%s): intercomm, self-comm, " + "or not all ranks local; disqualifying myself", + ompi_comm_print_cid(comm), comm->c_name); + + return NULL; + } + + int comm_size = ompi_comm_size(comm); + for(int r = 0; r < comm_size; r++) { + ompi_proc_t *proc = ompi_comm_peer_lookup(comm, r); + + if(proc->super.proc_arch != opal_local_arch) { + opal_output_verbose(MCA_BASE_VERBOSE_COMPONENT, + ompi_coll_base_framework.framework_output, + "coll:xhc:comm_query (%s/%s): All ranks not of the same arch; " + "disabling myself", ompi_comm_print_cid(comm), comm->c_name); + + return NULL; + } + } + + mca_coll_base_module_t *module = + (mca_coll_base_module_t *) OBJ_NEW(mca_coll_xhc_module_t); + + if(module == NULL) { + return NULL; + } + + module->coll_module_enable = mca_coll_xhc_module_enable; + module->coll_module_disable = mca_coll_xhc_module_disable; + + module->coll_barrier = mca_coll_xhc_barrier; + + if(mca_smsc == NULL) { + opal_output_verbose(MCA_BASE_VERBOSE_COMPONENT, + ompi_coll_base_framework.framework_output, + "coll:xhc: Warning: No opal/smsc support found; " + "only barrier will be enabled"); + + return module; + } + + module->coll_bcast = mca_coll_xhc_bcast; + + if(!mca_smsc_base_has_feature(MCA_SMSC_FEATURE_CAN_MAP)) { + opal_output_verbose(MCA_BASE_VERBOSE_COMPONENT, + ompi_coll_base_framework.framework_output, + "coll:xhc: Warning: opal/smsc module is not CAN_MAP capable; " + "(all)reduce will be disabled, bcast might see reduced performance"); + + return module; + } + + module->coll_allreduce = mca_coll_xhc_allreduce; + module->coll_reduce = mca_coll_xhc_reduce; + + return module; +} + +#define COLL_FN_HELPER(_m, _api) .coll_ ## _api = (_m)->coll_ ## _api, \ + .coll_ ## _api ## _module = (_m) + +int mca_coll_xhc_module_enable(mca_coll_base_module_t *ompi_module, + ompi_communicator_t *comm) { + + xhc_module_t *module = (xhc_module_t *) ompi_module; + + int ret; + + // --- + + ret = xhc_module_save_fallback_fns(module, comm); + + /* This can/will happen often (see #9885), but theoretically + * isn't a problem, as in these cases the component wouldn't + * end up getting used anyway. 
*/ + if(ret != OMPI_SUCCESS) { + opal_output_verbose(MCA_BASE_VERBOSE_COMPONENT, + ompi_coll_base_framework.framework_output, + "coll:xhc:module_enable (%s/%s): No previous fallback component " + "found; disabling myself", ompi_comm_print_cid(comm), comm->c_name); + + return ret; + } + + // --- + + module->comm_size = ompi_comm_size(comm); + module->rank = ompi_comm_rank(comm); + + module->peer_info = calloc(module->comm_size, sizeof(xhc_peer_info_t)); + + for(int r = 0; r < module->comm_size; r++) { + ompi_proc_t *peer_proc = ompi_comm_peer_lookup(comm, r); + + module->peer_info[r].proc = peer_proc; + module->peer_info[r].locality = peer_proc->super.proc_flags; + } + + module->peer_info[module->rank].locality |= + ((1 << OMPI_XHC_LOC_EXT_BITS) - 1) << OMPI_XHC_LOC_EXT_START; + + // --- + + /* This needs to happen here, and we need to save the hierarchy string, + * because the info value will be gone by the time lazy_init is + * called. Furthermore, we can't prepare the hierarchy here, as it might + * require communication (allgather) with the other ranks. */ + + const char *hier_mca = mca_coll_xhc_component.hierarchy_mca; + + opal_cstring_t *hier_info; + int hier_info_flag = 0; + + if(comm->super.s_info != NULL) { + opal_info_get(comm->super.s_info, "ompi_comm_coll_xhc_hierarchy", + &hier_info, &hier_info_flag); + + if(hier_info_flag) { + hier_mca = hier_info->string; + } + } + + module->hierarchy_string = strdup(hier_mca); + + if(hier_info_flag) { + OBJ_RELEASE(hier_info); + } + + if(!module->hierarchy_string) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + // --- + + ret = xhc_component_parse_chunk_sizes(mca_coll_xhc_component.chunk_size_mca, + &module->chunks, &module->chunks_len); + if(ret != OMPI_SUCCESS) { + return ret; + } + + // --- + + xhc_coll_fns_t xhc_fns = (xhc_coll_fns_t) { + COLL_FN_HELPER(ompi_module, allreduce), + COLL_FN_HELPER(ompi_module, barrier), + COLL_FN_HELPER(ompi_module, bcast), + COLL_FN_HELPER(ompi_module, reduce) + }; + + xhc_module_install_fns(module, comm, xhc_fns); + + return OMPI_SUCCESS; +} + +int mca_coll_xhc_module_disable(mca_coll_base_module_t *ompi_module, + ompi_communicator_t *comm) { + + xhc_module_t *module = (xhc_module_t *) ompi_module; + + xhc_module_install_fallback_fns(module, comm, NULL); + mca_coll_xhc_module_destruct(module); + + return OMPI_SUCCESS; +} + +// ----------------------------- + +#define SAVE_FALLBACK_COLL(_comm, _m, _dst, _api) do { \ + if((_m)->coll_ ## _api) { \ + MCA_COLL_SAVE_API(_comm, _api, (_dst).coll_ ## _api, \ + (_dst).coll_ ## _api ## _module, "xhc"); \ + \ + if(!(_dst).coll_ ## _api || !(_dst).coll_ ## _api ## _module) { \ + _save_status = OMPI_ERR_NOT_FOUND; \ + } \ + } \ +} while(0) + +#define INSTALL_FALLBACK_COLL(_comm, _m, _saved, _new, _api) do { \ + if((_comm)->c_coll->coll_ ## _api ## _module == (_m)) { \ + MCA_COLL_SAVE_API(_comm, _api, (_saved).coll_ ## _api, \ + (_saved).coll_ ## _api ## _module, "xhc"); \ + MCA_COLL_INSTALL_API(_comm, _api, (_new).coll_ ## _api, \ + (_new).coll_ ## _api ## _module, "xhc"); \ + } \ +} while(0) + +#define INSTALL_COLL(_comm, _src, _api) do { \ + if((_src).coll_ ## _api) { \ + MCA_COLL_INSTALL_API(_comm, _api, (_src).coll_ ## _api, \ + (_src).coll_ ## _api ## _module, "xhc"); \ + } \ +} while(0) + +/* Save the function pointers of the previous module, in XHC's + * struct. Only the functions that XHC will provide are saved. 
*/ +static int xhc_module_save_fallback_fns( + xhc_module_t *module, ompi_communicator_t *comm) { + + mca_coll_base_module_t *ompi_module = (mca_coll_base_module_t *) module; + + xhc_coll_fns_t colls = {0}; + int _save_status = OMPI_SUCCESS; + + SAVE_FALLBACK_COLL(comm, ompi_module, colls, allreduce); + SAVE_FALLBACK_COLL(comm, ompi_module, colls, barrier); + SAVE_FALLBACK_COLL(comm, ompi_module, colls, bcast); + SAVE_FALLBACK_COLL(comm, ompi_module, colls, reduce); + + if(_save_status == OMPI_SUCCESS) { + module->prev_colls = colls; + } + + return _save_status; +} + +/* Replace XHC's pointers in c_coll with those from the fallback + * component saved earlier. XHC's pointers are conveniently returned + * in prev_fns_dst, to later pass to xhc_module_install_fns. */ +void mca_coll_xhc_module_install_fallback_fns(xhc_module_t *module, + ompi_communicator_t *comm, xhc_coll_fns_t *prev_fns_dst) { + + mca_coll_base_module_t *ompi_module = (mca_coll_base_module_t *) module; + + xhc_coll_fns_t saved = {0}; + + INSTALL_FALLBACK_COLL(comm, ompi_module, saved, module->prev_colls, allreduce); + INSTALL_FALLBACK_COLL(comm, ompi_module, saved, module->prev_colls, barrier); + INSTALL_FALLBACK_COLL(comm, ompi_module, saved, module->prev_colls, bcast); + INSTALL_FALLBACK_COLL(comm, ompi_module, saved, module->prev_colls, reduce); + + if(prev_fns_dst) { + *prev_fns_dst = saved; + } +} + +/* */ +void mca_coll_xhc_module_install_fns(xhc_module_t *module, + ompi_communicator_t *comm, xhc_coll_fns_t fns) { + + (void) module; + + INSTALL_COLL(comm, fns, allreduce); + INSTALL_COLL(comm, fns, barrier); + INSTALL_COLL(comm, fns, bcast); + INSTALL_COLL(comm, fns, reduce); +} + +// ----------------------------- + +int mca_coll_xhc_module_prepare_hierarchy( + xhc_module_t *module, ompi_communicator_t *comm) { + + int ret; + + opal_list_t *level_defs; + int nlevel_defs; + + ret = xhc_component_parse_hierarchy(module->hierarchy_string, + &level_defs, &nlevel_defs); + if(ret != OMPI_SUCCESS) { + return ret; + } + + ret = xhc_module_create_hierarchy(module, comm, level_defs, + nlevel_defs, &module->hierarchy, &module->hierarchy_len); + if(ret != OMPI_SUCCESS) { + return ret; + } + + for(int i = 0; i < nlevel_defs; i++) + OPAL_LIST_DESTRUCT(&level_defs[i]); + free(level_defs); + + ret = xhc_module_sort_hierarchy(module, comm, + &module->hierarchy, &module->hierarchy_len); + if(ret != OMPI_SUCCESS) { + return ret; + } + + return OMPI_SUCCESS; +} + +static int xhc_module_create_hierarchy(xhc_module_t *module, + ompi_communicator_t *comm, opal_list_t *level_defs, int nlevel_defs, + xhc_loc_t **hierarchy_dst, int *hierarchy_len_dst) { + + xhc_peer_info_t *peer_info = module->peer_info; + + int comm_size = ompi_comm_size(comm); + int rank = ompi_comm_rank(comm); + + xhc_loc_t *hierarchy = NULL; + int nvirt_hiers = 0; + + int *rank_list; + + opal_hwloc_locality_t *loc_list; + ompi_datatype_t *hwloc_locality_type = NULL; + + int ret, return_code = OMPI_SUCCESS; + + hierarchy = malloc(nlevel_defs * sizeof(xhc_loc_t)); + rank_list = malloc(comm_size * sizeof(int)); + loc_list = malloc(comm_size * sizeof(opal_hwloc_locality_t)); + + if(!hierarchy || !rank_list || !loc_list) { + RETURN_WITH_ERROR(return_code, OMPI_ERR_OUT_OF_RESOURCE, end); + } + + switch(sizeof(opal_hwloc_locality_t)) { + case 1: hwloc_locality_type = MPI_UINT8_T; break; + case 2: hwloc_locality_type = MPI_UINT16_T; break; + case 4: hwloc_locality_type = MPI_UINT32_T; break; + case 8: hwloc_locality_type = MPI_UINT64_T; break; + } + assert(hwloc_locality_type); + + 
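/* Build the hierarchy level by level. Each level definition either maps + * directly to a named hwloc locality, or, when rank lists, the split/max-ranks + * modifiers, or multiple '+'-separated items are involved, to a newly allocated + * "virtual" locality bit from the OMPI_XHC_LOC_EXT range. For every level, the + * loop records in peer_info[].locality which ranks end up sharing that locality. */ + 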
for(int h = 0; h < nlevel_defs; h++) { + opal_list_t *defs = &level_defs[h]; + + xhc_loc_def_t *my_def = NULL; + xhc_loc_t locality; + + xhc_loc_def_t *def_0 = (xhc_loc_def_t *) opal_list_get_first(defs); + + bool is_virtual = (opal_list_get_size(defs) > 1 || def_0->rank_list + || def_0->split > 1 || def_0->max_ranks > 0); + + if(is_virtual) { + if(nvirt_hiers == OMPI_XHC_LOC_EXT_BITS) { + opal_output_verbose(MCA_BASE_VERBOSE_COMPONENT, + ompi_coll_base_framework.framework_output, + "coll:xhc: Error: Too many virtual hierarchies"); + + RETURN_WITH_ERROR(return_code, OMPI_ERR_NOT_SUPPORTED, end); + } + + locality = 1 << (OMPI_XHC_LOC_EXT_START + nvirt_hiers); + nvirt_hiers++; + } else { + locality = def_0->named_loc; + } + + hierarchy[h] = locality; + def_0 = NULL; + + xhc_loc_def_t *def, *def_next; + + /* Handle rank lists; take note if I belong + * in one, and remove them from the mix */ + OPAL_LIST_FOREACH_SAFE(def, def_next, defs, xhc_loc_def_t) { + if(def->rank_list) { + if(!my_def) { + for(int rl = 0; rl < def->rank_list_len; rl++) { + if(rank >= def->rank_list[rl].start_rank + && rank <= def->rank_list[rl].end_rank) { + my_def = def; + break; + } + } + } + + opal_list_remove_item(defs, (opal_list_item_t *) def); + if(def != my_def) { + OBJ_RELEASE(def); + } + } + } + + bool dir_fwd = true; + + /* When multiple locality definitions are present, they are assigned + * to groups in a left-to-right fashion. At every turn, the first + * rank (determined by the minimum ID) that's still not part of + * a locality, as well as the other ranks that are local with it, + * claim/consume the next locality from the list. The direction + * serves to implement the repeat modifier: when a definition marked + * with it is reached, consumption switches to right-to-left, following + * the maximum ID instead. After the loop, the repeated locality is the + * only one left, and all remaining ranks follow it. */ + while(opal_list_get_size(defs) > 1) { + def = (xhc_loc_def_t *) (dir_fwd ? opal_list_get_first(defs) + : opal_list_get_last(defs)); + + if(dir_fwd && def->repeat) { + dir_fwd = false; + continue; + } + + int ticket = (my_def == NULL ? rank : (dir_fwd ? comm_size : -1)); + int chosen; + + ret = comm->c_coll->coll_allreduce(&ticket, &chosen, 1, + MPI_INT, (dir_fwd ? MPI_MIN : MPI_MAX), comm, + comm->c_coll->coll_allreduce_module); + if(ret != OMPI_SUCCESS) { + RETURN_WITH_ERROR(return_code, ret, end); + } + + if(chosen >= 0 && chosen < comm_size + && PEER_IS_LOCAL(peer_info, chosen, def->named_loc)) { + + my_def = def; + } + + opal_list_remove_item(defs, (opal_list_item_t *) def); + if(def != my_def) { + OBJ_RELEASE(def); + } + } + + if(opal_list_get_size(defs) > 0 && !my_def) { + my_def = (xhc_loc_def_t *) opal_list_get_first(defs); + opal_list_remove_item(defs, (opal_list_item_t *) my_def); + } + + /* Share which named locality each rank follows; ranks that + * follow different localities shouldn't be grouped together */ + opal_hwloc_locality_t follow_loc = (my_def ? 
my_def->named_loc : 0); + ret = comm->c_coll->coll_allgather(&follow_loc, 1, + hwloc_locality_type, loc_list, 1, hwloc_locality_type, + comm, comm->c_coll->coll_allgather_module); + if(ret != OMPI_SUCCESS) { + RETURN_WITH_ERROR(return_code, ret, end); + } + + if(my_def == NULL) { + continue; + } + + int member_id; + int members = 0; + + // If working with a rank list, mark the ranks from the list as "local" + if(my_def->rank_list) { + for(int i = 0; i < my_def->rank_list_len; i++) { + for(int r = my_def->rank_list[i].start_rank; + r <= my_def->rank_list[i].end_rank && r < comm_size; r++) { + if(r == rank) { + member_id = members; + } + + peer_info[r].locality |= locality; + rank_list[members++] = r; + } + } + } else if(is_virtual) { + /* We might have a named locality instead of a rank list, but if + * a virtual locality still had to be created for it, we need to apply it */ + for(int r = 0; r < comm_size; r++) { + if(loc_list[r] != my_def->named_loc) { + continue; + } + + if(!PEER_IS_LOCAL(peer_info, r, my_def->named_loc)) { + continue; + } + + if(r == rank) { + member_id = members; + } + + peer_info[r].locality |= locality; + rank_list[members++] = r; + } + } + + /* If split or max-ranks was specified, partition the locality + * and remove the previously added locality mapping from some ranks */ + if(my_def->split > 1) { + int piece_size = members / my_def->split; + int leftover = members % my_def->split; + + for(int m = 0, next_border = 0; m < members; m++) { + if(m == next_border) { + next_border += piece_size + (leftover > 0 ? 1 : 0); + if(leftover > 0) { + leftover--; + } + + if(member_id >= m && member_id < next_border) { + m = next_border - 1; + continue; + } + } + + peer_info[rank_list[m]].locality &= ~locality; + } + } else if(my_def->max_ranks > 1) { + for(int m = 0; m < members; m++) { + if(m % my_def->max_ranks == 0) { + if(member_id >= m && member_id - m < my_def->max_ranks) { + m += my_def->max_ranks - 1; + continue; + } + } + + peer_info[rank_list[m]].locality &= ~locality; + } + } + + OBJ_RELEASE_IF_NOT_NULL(my_def); + } + + *hierarchy_dst = hierarchy; + *hierarchy_len_dst = nlevel_defs; + +end: + + free(rank_list); + + if(return_code != OMPI_SUCCESS) { + free(hierarchy); + } + + return return_code; +} + +static int xhc_module_sort_hierarchy(xhc_module_t *module, + ompi_communicator_t *comm, xhc_loc_t **hierarchy_dst, + int *hierarchy_len_dst) { + + xhc_peer_info_t *peer_info = module->peer_info; + int comm_size = ompi_comm_size(comm); + + xhc_loc_t *old_hier = *hierarchy_dst; + int hier_len = *hierarchy_len_dst; + + xhc_loc_t *new_hier = NULL; + bool *hier_done = NULL; + + int return_code = OMPI_SUCCESS; + + new_hier = malloc((hier_len + 1) * sizeof(xhc_loc_t)); + hier_done = calloc(hier_len, sizeof(bool)); + + if(new_hier == NULL || hier_done == NULL) { + RETURN_WITH_ERROR(return_code, OMPI_ERR_OUT_OF_RESOURCE, end); + } + + bool has_virtual = false; + for(int i = 0; i < hier_len; i++) { + if(old_hier[i] >= (1 << OMPI_XHC_LOC_EXT_START)) { + has_virtual = true; + break; + } + } + + /* If any virtual hierarchy is involved, attempting to sort it is likely + * asking for trouble. Skip the sorting, and only consider adding a top + * common locality. There is a chance it wasn't actually necessary, but + * it never hurts. 
*/ + + if(has_virtual) { + memcpy(new_hier, old_hier, hier_len * sizeof(xhc_loc_t)); + } else { + for(int new_idx = hier_len - 1; new_idx >= 0; new_idx--) { + int max_matches_count = -1; + int max_matches_hier_idx = -1; + + for(int i = 0; i < hier_len; i++) { + if(hier_done[i]) { + continue; + } + + int matches = 0; + + for(int r = 0; r < comm_size; r++) { + if(PEER_IS_LOCAL(peer_info, r, old_hier[i])) { + matches++; + } + } + + if(matches > max_matches_count) { + max_matches_count = matches; + max_matches_hier_idx = i; + } + } + + assert(max_matches_count != -1); + + new_hier[new_idx] = old_hier[max_matches_hier_idx]; + hier_done[max_matches_hier_idx] = true; + } + } + + xhc_loc_t common_locality = (xhc_loc_t) -1; + + for(int r = 0; r < comm_size; r++) { + ompi_proc_t *proc = ompi_comm_peer_lookup(comm, r); + common_locality &= proc->super.proc_flags; + } + + if(common_locality == 0) { + opal_output_verbose(MCA_BASE_VERBOSE_COMPONENT, + ompi_coll_base_framework.framework_output, + "coll:xhc: Error: There is no locality common " + "to all ranks in the communicator"); + + RETURN_WITH_ERROR(return_code, OMPI_ERR_NOT_SUPPORTED, end); + } + + if(hier_len == 0 || (common_locality & new_hier[hier_len - 1]) + != new_hier[hier_len - 1]) { + + new_hier[hier_len] = common_locality; + hier_len++; + } + + REALLOC(new_hier, hier_len, xhc_loc_t); + + free(old_hier); + + *hierarchy_dst = new_hier; + *hierarchy_len_dst = hier_len; + +end: + + free(hier_done); + + if(return_code != OMPI_SUCCESS) { + free(new_hier); + } + + return return_code; +} diff --git a/ompi/mca/coll/xhc/coll_xhc_reduce.c b/ompi/mca/coll/xhc/coll_xhc_reduce.c new file mode 100644 index 00000000000..5f28986fb66 --- /dev/null +++ b/ompi/mca/coll/xhc/coll_xhc_reduce.c @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2021-2023 Computer Architecture and VLSI Systems (CARV) + * Laboratory, ICS Forth. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" +#include "mpi.h" + +#include "ompi/constants.h" +#include "ompi/datatype/ompi_datatype.h" +#include "ompi/communicator/communicator.h" +#include "ompi/op/op.h" + +#include "opal/mca/rcache/base/base.h" +#include "opal/util/show_help.h" +#include "opal/util/minmax.h" + +#include "coll_xhc.h" + +int mca_coll_xhc_reduce(const void *sbuf, void *rbuf, + int count, ompi_datatype_t *datatype, ompi_op_t *op, int root, + ompi_communicator_t *ompi_comm, mca_coll_base_module_t *ompi_module) { + + xhc_module_t *module = (xhc_module_t *) ompi_module; + + // Currently, XHC's reduce only supports root = 0 + if(root == 0) { + return xhc_allreduce_internal(sbuf, rbuf, count, + datatype, op, ompi_comm, ompi_module, false); + } else { + xhc_coll_fns_t fallback = module->prev_colls; + + return fallback.coll_reduce(sbuf, rbuf, count, datatype, + op, root, ompi_comm, fallback.coll_reduce_module); + } +} diff --git a/ompi/mca/coll/xhc/help-coll-xhc.txt b/ompi/mca/coll/xhc/help-coll-xhc.txt new file mode 100644 index 00000000000..453a96df4fc --- /dev/null +++ b/ompi/mca/coll/xhc/help-coll-xhc.txt @@ -0,0 +1,24 @@ +# +# Copyright (c) 2021-2023 Computer Architecture and VLSI Systems (CARV) +# Laboratory, ICS Forth. All rights reserved. 
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +[bad-hierarchy-item] +WARNING (coll/xhc) +Unrecognized locality definition '%s' in hierarchy parameter string '%s' +The component won't load +# +[bad-chunk-size-item] +WARNING (coll/xhc) +Malformed item '%s' in chunk size parameter string '%s' +The component won't load +# +[xhc-init-failed] +WARNING (coll/xhc) +Component initialization failed with error code %d +Errno: %d (%s) diff --git a/ompi/mca/coll/xhc/resources/xhc-hierarchy.svg b/ompi/mca/coll/xhc/resources/xhc-hierarchy.svg new file mode 100755 index 00000000000..c8f6d8a2da3 --- /dev/null +++ b/ompi/mca/coll/xhc/resources/xhc-hierarchy.svg @@ -0,0 +1,1176 @@
[SVG body not reproduced here: a 1176-line vector diagram of the XHC hierarchy, showing the System, Socket, and NUMA levels, per-NUMA leaders, and cores/processes P0 through P15.]
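For illustration, here is a minimal usage sketch of the per-communicator hierarchy override: mca_coll_xhc_module_enable() above reads the "ompi_comm_coll_xhc_hierarchy" info key from the communicator's info object, falling back to the coll_xhc_hierarchy MCA parameter when the key is absent. The key name comes from this patch; the program itself and the "numa,socket" value are only illustrative assumptions, and whether XHC actually serves the call still depends on component priority and the checks in comm_query.

/* Hypothetical application-side sketch: request a specific XHC hierarchy
 * for one duplicated communicator via an MPI info key. */
#include <mpi.h>

int main(int argc, char **argv) {
    MPI_Init(&argc, &argv);

    MPI_Info info;
    MPI_Info_create(&info);

    /* Hierarchy string in the format documented in
     * mca_coll_xhc_component_parse_hierarchy(); "numa,socket" is an example. */
    MPI_Info_set(info, "ompi_comm_coll_xhc_hierarchy", "numa,socket");

    MPI_Comm comm;
    MPI_Comm_dup_with_info(MPI_COMM_WORLD, info, &comm);

    int one = 1, sum = 0;
    /* This allreduce may be served by coll/xhc on the duplicated communicator. */
    MPI_Allreduce(&one, &sum, 1, MPI_INT, MPI_SUM, comm);

    MPI_Comm_free(&comm);
    MPI_Info_free(&info);
    MPI_Finalize();
    return 0;
}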