From 7b59f8ebcbce501fedce3c63531e5f366ffff051 Mon Sep 17 00:00:00 2001 From: George Katevenis Date: Wed, 15 Feb 2023 09:11:41 +0200 Subject: [PATCH 1/2] Add the XHC (XPMEM-based Hierarchical Collectives) component XHC implements hierarchical and topology-aware intra-node collectives, using XPMEM for inter-process communication. See the README for more information. Signed-off-by: George Katevenis --- ompi/mca/coll/xhc/Makefile.am | 44 + ompi/mca/coll/xhc/README.md | 282 ++++ ompi/mca/coll/xhc/coll_xhc.c | 748 +++++++++++ ompi/mca/coll/xhc/coll_xhc.h | 514 +++++++ ompi/mca/coll/xhc/coll_xhc_allreduce.c | 1121 ++++++++++++++++ ompi/mca/coll/xhc/coll_xhc_atomic.h | 116 ++ ompi/mca/coll/xhc/coll_xhc_barrier.c | 138 ++ ompi/mca/coll/xhc/coll_xhc_bcast.c | 341 +++++ ompi/mca/coll/xhc/coll_xhc_component.c | 677 ++++++++++ ompi/mca/coll/xhc/coll_xhc_module.c | 721 ++++++++++ ompi/mca/coll/xhc/coll_xhc_reduce.c | 41 + ompi/mca/coll/xhc/help-coll-xhc.txt | 24 + ompi/mca/coll/xhc/resources/xhc-hierarchy.svg | 1176 +++++++++++++++++ 13 files changed, 5943 insertions(+) create mode 100644 ompi/mca/coll/xhc/Makefile.am create mode 100644 ompi/mca/coll/xhc/README.md create mode 100644 ompi/mca/coll/xhc/coll_xhc.c create mode 100644 ompi/mca/coll/xhc/coll_xhc.h create mode 100644 ompi/mca/coll/xhc/coll_xhc_allreduce.c create mode 100644 ompi/mca/coll/xhc/coll_xhc_atomic.h create mode 100644 ompi/mca/coll/xhc/coll_xhc_barrier.c create mode 100644 ompi/mca/coll/xhc/coll_xhc_bcast.c create mode 100644 ompi/mca/coll/xhc/coll_xhc_component.c create mode 100644 ompi/mca/coll/xhc/coll_xhc_module.c create mode 100644 ompi/mca/coll/xhc/coll_xhc_reduce.c create mode 100644 ompi/mca/coll/xhc/help-coll-xhc.txt create mode 100755 ompi/mca/coll/xhc/resources/xhc-hierarchy.svg diff --git a/ompi/mca/coll/xhc/Makefile.am b/ompi/mca/coll/xhc/Makefile.am new file mode 100644 index 00000000000..35db0b89c12 --- /dev/null +++ b/ompi/mca/coll/xhc/Makefile.am @@ -0,0 +1,44 @@ +# +# Copyright (c) 2021-2023 Computer Architecture and VLSI Systems (CARV) +# Laboratory, ICS Forth. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +dist_opaldata_DATA = help-coll-xhc.txt + +sources = \ + coll_xhc.h \ + coll_xhc_atomic.h \ + coll_xhc.c \ + coll_xhc_component.c \ + coll_xhc_module.c \ + coll_xhc_bcast.c \ + coll_xhc_barrier.c \ + coll_xhc_reduce.c \ + coll_xhc_allreduce.c + +# Make the output library in this directory, and name it either +# mca__.la (for DSO builds) or libmca__.la +# (for static builds). + +component_noinst = +component_install = +if MCA_BUILD_ompi_coll_xhc_DSO +component_install += mca_coll_xhc.la +else +component_noinst += libmca_coll_xhc.la +endif + +mcacomponentdir = $(ompilibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_coll_xhc_la_SOURCES = $(sources) +mca_coll_xhc_la_LDFLAGS = -module -avoid-version +mca_coll_xhc_la_LIBADD = $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la + +noinst_LTLIBRARIES = $(component_noinst) +libmca_coll_xhc_la_SOURCES = $(sources) +libmca_coll_xhc_la_LDFLAGS = -module -avoid-version diff --git a/ompi/mca/coll/xhc/README.md b/ompi/mca/coll/xhc/README.md new file mode 100644 index 00000000000..325170b7179 --- /dev/null +++ b/ompi/mca/coll/xhc/README.md @@ -0,0 +1,282 @@ +# XHC: XPMEM-based Hierarchical Collectives + +The XHC component, implements hierarchical & topology-aware intra-node MPI +collectives, utilizing XPMEM in order to achieve efficient shared address space +memory access between processes. 
+
+## Main features
+
+* Constructs an **n-level hierarchy** (i.e. no algorithmic limitation on level
+count), following the system's hardware topology. Ranks/processes are grouped
+together according to their relative locations; this information is known
+thanks to Hwloc, and is obtained via Open MPI's integrated book-keeping.
+
+  Topological features that can currently be defined (configurable via MCA params):
+
+  - NUMA node
+  - CPU Socket
+  - L1, L2, L3 cache
+  - Hwthread, core
+  - Node/flat (no hierarchy)
+
+  Example of a 3-level XHC hierarchy (sensitivity to numa & socket locality):
+
+  ![Example of 3-level XHC hierarchy](resources/xhc-hierarchy.svg)
+
+  Furthermore, support for custom virtual user-defined hierarchies is
+  available, to aid when fine-grained control over the communication pattern
+  is necessary.
+
+* Support for both **zero-copy** and **copy-in-copy-out** data transportation.
+  - Switchover at a static but configurable message size.
+
+  - CICO buffers permanently attached at module initialization.
+
+  - Application buffers attached on the fly the first time they appear, saved
+  in and recovered from the registration cache in subsequent appearances.
+  (assuming smsc/xpmem)
+
+* Integration with Open MPI's `opal/smsc` (shared-memory-single-copy)
+framework. Selection of `smsc/xpmem` is highly recommended.
+
+  - Bcast support: XPMEM, CMA, KNEM
+  - Allreduce support: XPMEM
+  - Barrier support: *(all, irrelevant)*
+
+* Data-wise **pipelining** across all levels of the hierarchy allows for
+lowering hierarchy-induced start-up overheads. Pipelining also allows for
+interleaving of operations in some collectives (reduce+bcast in allreduce).
+
+* **Lock-free** single-writer synchronization, with cache-line separation where
+necessary/beneficial. Consistency is ensured via lightweight memory barriers.
+
+## Configuration options -- MCA params
+
+XHC can be customized via a number of standard Open MPI MCA parameters, though
+defaults that should satisfy a wide range of systems are in place.
+
+The available parameters:
+
+#### *(prepend with "coll_xhc_")*
+*(list may be outdated, please also check `ompi_info` and `coll_xhc_component.c`)*
+
+* **priority** (default `0`): The priority of the coll/xhc component, used
+during the component selection process.
+
+* **print_info** (default `false`): Print information about XHC's generated
+hierarchy and its configuration.
+
+* **shmem_backing** (default `/dev/shm`): Backing directory for shmem files
+used for XHC's synchronization fields and CICO buffers.
+
+* **dynamic_leader** (default `false`): Enables the feature that dynamically
+elects an XHC group leader at each collective (currently only applicable
+to bcast).
+
+* **dynamic_reduce** (default `1`=`non-float`): Controls the
+feature that allows for out-of-order reduction. XHC ranks reduce chunks
+directly from multiple peers' buffers; dynamic reduction allows them to
+temporarily skip a peer when the expected data is not yet prepared, instead of
+stalling. Setting it to `2`=`all` may harm the reproducibility of float-based
+reductions.
+
+* **lb_reduce_leader_assist** (default `top,first`): Controls the
+leader-to-member load balancing mode in reductions. If set to none/empty (`""`),
+only non-leader group members perform reductions. With `top` in the list, the
+leader of the top-most level also performs reductions in its group. With
+`first` in the list, leaders will help in the reduction workload for just one
+chunk at the beginning of the operation. If `all` is specified, all group
+members, including the leaders, perform reductions indiscriminately.
+
+* **force_reduce** (default `false`): Force-enable the "special" Reduce
+implementation for all calls to MPI_Reduce. This implementation assumes that
+the `rbuf` parameter to MPI_Reduce is valid and appropriately sized for all
+ranks, not just the root -- you have to make sure that this is indeed the case
+with the application at hand. Only works with `root = 0`.
+
+* **hierarchy** (default `"numa,socket"`): A comma-separated list of
+topological features to which XHC's hierarchy-building algorithm should be
+sensitive. `ompi_info` reports the possible values for the parameter.
+
+  - In some ways, this is "just" a suggestion. The resulting hierarchy may
+  not exactly match the requested one. Reasons this may occur:
+
+    - A requested topological feature does not effectively segment the set
+    of ranks. (eg. `numa` was specified, but all ranks reside in the same
+    NUMA node)
+
+    - No feature that all ranks have in common was provided. This is a more
+    intrinsic detail that you probably don't need to be aware of, but that
+    you might come across if, eg., you investigate the output of `print_info`.
+    An additional level will automatically be added in this case; there is no
+    need to worry about it.
+
+      For all intents and purposes, a hierarchy of `numa,socket` is
+      interpreted as "segment the ranks according to NUMA node locality,
+      and then further segment them according to CPU socket locality".
+      Three groups will be created: the intra-NUMA one, the intra-socket
+      one, and an intra-node one.
+
+    - The provided features will automatically be re-ordered when their
+    order does not match their order in the physical system. (unless a
+    virtual hierarchy feature is present in the list)
+
+  - *Virtual Hierarchies*: The string may alternatively also contain "rank
+  lists" which specify exactly which ranks to group together, as well as some
+  other special modifiers. See
+  `coll_xhc_component.c:xhc_component_parse_hierarchy()` for further
+  explanation as well as syntax information.
+
+* **chunk_size** (default `16K`): The chunk size for the pipelining process.
+Data is processed (eg. broadcast, reduced) in pieces of this size at a time.
+
+  - It's possible to have a different chunk size for each level of the
+  hierarchy, achieved by providing a comma-separated list of sizes (eg.
+  `"16K,16K,128K"`) instead of a single one. The sizes in this list *DO NOT*
+  correspond to the items on the hierarchy list; the hierarchy keys might be
+  re-ordered or reduced to match the system, but the chunk sizes will be
+  consumed in the order they are given, left-to-right -> bottom-to-top.
+
+* **uniform_chunks** (default `true`): Automatically optimize the chunk size
+in reduction collectives, according to the message size, so that all members
+will perform equal work.
+
+* **uniform_chunks_min** (default `1K`): The lowest allowed value for the chunk
+size when uniform chunks are enabled. Each worker will reduce at least this much
+data, or we don't bother splitting the workload up.
+
+* **cico_max** (default `1K`): Copy-in-copy-out, instead of single-copy, will be
+used for messages of *cico_max* bytes or less.
+
+*(Removed Parameters)*
+
+* **rcache_max**, **rcache_max_global** *(REMOVED with shift to opal/smsc)*:
+Limit on the number of attachments that the registration cache should hold.
+
+  - A case can be made about their usefulness. If desired, they should be
+  re-implemented at the smsc level.
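+
+Note: an easy way to view the complete, current list of XHC's parameters along
+with their values on a given installation is to query `ompi_info` (the exact
+flags may vary slightly between Open MPI versions):
+
+`$ ompi_info --param coll xhc --level 9`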
+
+## Limitations
+
+- *Intra-node support only*
+  - Usage in multi-node scenarios is possible via Open MPI's HAN.
+
+- **Heterogeneity**: XHC does not support nodes with non-uniform (rank-wise)
+datatype representations. (determined according to the `proc_arch` field)
+
+- **Non-commutative** operators are not supported by XHC's reduction
+collectives. In past versions, they were supported, but only with the flat
+hierarchy configuration; this could make a return at some point.
+
+- XHC's Reduce is not a general, standalone implementation. Instead, it is a
+"special" implementation of MPI_Reduce, realized as a sub-case of XHC's
+Allreduce.
+
+  - If the caller guarantees that the `rbuf` parameter is valid for all ranks
+  (not just the root), like in Allreduce, this special Reduce can be invoked
+  by specifying `root=-1`, which will trigger a Reduce to rank `0` (the only
+  one currently supported).
+
+    - Current prime use-case: HAN's Allreduce
+
+  - Furthermore, if it is guaranteed that all Reduce calls in an application
+  satisfy the above criteria, see the `force_reduce` MCA parameter.
+
+  - XHC's Reduce is not yet fully optimized for small messages.
+
+## Building
+
+XHC is built as a standard mca/coll component.
+
+To reap its full benefits, XPMEM support in Open MPI is required. XHC will
+build and work without it, but the reduction operations will be disabled and
+broadcast will fall back to less efficient mechanisms (CMA, KNEM).
+
+## Running
+
+In order for the XHC component to be chosen, make sure that its priority is
+higher than other components that provide the collectives of interest; use the
+`coll_xhc_priority` MCA parameter. If a list of collective modules is included
+via the `coll` MCA parameter, make sure XHC is in the list.
+
+* You may also want to add the `--bind-to core` param. Otherwise, the reported
+process localities might be too general, preventing XHC from correctly
+segmenting the system. (`coll_xhc_print_info` will report the generated
+hierarchy)
+
+### Tuning
+
+* Optional: You might wish to manually specify the topological features that
+XHC's hierarchy should conform to. The default is `numa,socket`, which will
+group the processes according to NUMA locality and then further group them
+according to socket locality. See the `coll_xhc_hierarchy` param.
+
+  - Example: `--mca coll_xhc_hierarchy numa,socket`
+  - Example: `--mca coll_xhc_hierarchy numa`
+  - Example: `--mca coll_xhc_hierarchy flat`
+
+  In some systems, small-message Broadcast or the Barrier operation might
+  perform better with a flat tree instead of a hierarchical one. Currently,
+  manual benchmarking is required to accurately determine this.
+
+* Optional: You might wish to tune XHC's chunk size (default `16K`). Use the
+`coll_xhc_chunk_size` param; try values close to the default and see if
+improvements are observed. You may even try specifying different chunk sizes
+for each hierarchy level -- follow the same procedure, starting from the same
+chunk size for all levels and decreasing/increasing from there.
+
+  - Example: `--mca coll_xhc_chunk_size 16K`
+  - Example: `--mca coll_xhc_chunk_size 16K,32K,128K`
+
+* Optional: If you wish to focus on latencies of small messages, you can try
+altering the cico-to-zcopy switchover point (`coll_xhc_cico_max`, default
+`1K`).
+ + - Example: `--mca coll_xhc_cico_max 1K` + +* Optional: If your application is heavy in Broadcast calls and you suspect +that specific ranks might be joining the collective with delay and causing +others to stall waiting for them, you could try enabling dynamic leadership +(`coll_xhc_dynamic_leader`), and seeing if it marks an improvement. + + - Example: `--mca coll_xhc_dynamic_leader 1` + +### Example command lines + +*Assuming `PATH` and `LD_LIBRARY_PATH` have been set appropriately.* + +Default XHC configuration: +`$ mpirun --mca coll libnbc,basic,xhc --mca coll_xhc_priority 100 --bind-to core ` + +XHC w/ numa-sensitive hierarchy, chunk size @ 16K: +`$ mpirun --mca coll libnbc,basic,xhc --mca coll_xhc_priority 100 --mca coll_xhc_hierarchy numa --mca coll_xhc_chunk_size 16K --bind-to core ` + +XHC with flat hierarchy (ie. none at all): +`$ mpirun --mca coll libnbc,basic,xhc --mca coll_xhc_priority 100 --mca coll_xhc_hierarchy node [--bind-to core] ` + +## Publications + +1. **A framework for hierarchical single-copy MPI collectives on multicore nodes**, +*George Katevenis, Manolis Ploumidis, Manolis Marazakis*, +IEEE Cluster 2022, Heidelberg, Germany. +https://ieeexplore.ieee.org/document/9912729 + +## Contact + +- George Katevenis (gkatev@ics.forth.gr) +- Manolis Ploumidis (ploumid@ics.forth.gr) + +Computer Architecture and VLSI Systems (CARV) Laboratory, ICS Forth + +## Acknowledgments + +We thankfully acknowledge the support of the European Commission and the Greek +General Secretariat for Research and Innovation under the EuroHPC Programme +through the **DEEP-SEA** project (GA 955606). National contributions from the +involved state members (including the Greek General Secretariat for Research +and Innovation) match the EuroHPC funding. + +This work is partly supported by project **EUPEX**, which has received funding +from the European High-Performance Computing Joint Undertaking (JU) under grant +agreement No 101033975. The JU receives support from the European Union's +Horizon 2020 re-search and innovation programme and France, Germany, Italy, +Greece, United Kingdom, Czech Republic, Croatia. diff --git a/ompi/mca/coll/xhc/coll_xhc.c b/ompi/mca/coll/xhc/coll_xhc.c new file mode 100644 index 00000000000..d7221ffb37a --- /dev/null +++ b/ompi/mca/coll/xhc/coll_xhc.c @@ -0,0 +1,748 @@ +/* + * Copyright (c) 2021-2023 Computer Architecture and VLSI Systems (CARV) + * Laboratory, ICS Forth. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "mpi.h" +#include "ompi/constants.h" +#include "ompi/communicator/communicator.h" +#include "ompi/mca/coll/coll.h" +#include "ompi/mca/coll/base/base.h" + +#include "opal/mca/rcache/rcache.h" +#include "opal/mca/shmem/base/base.h" +#include "opal/mca/smsc/smsc.h" + +#include "opal/include/opal/align.h" +#include "opal/util/show_help.h" +#include "opal/util/minmax.h" + +#include "coll_xhc.h" + +static int xhc_comms_make(ompi_communicator_t *ompi_comm, + xhc_peer_info_t *peer_info, xhc_comm_t **comms_dst, + int *comm_count_dst, xhc_loc_t *hierarchy, int hierarchy_len); +static void xhc_comms_destroy(xhc_comm_t *comms, int comm_count); + +static int xhc_print_info(xhc_module_t *module, + ompi_communicator_t *comm, xhc_data_t *data); + +static void *xhc_shmem_create(opal_shmem_ds_t *seg_ds, size_t size, + ompi_communicator_t *ompi_comm, const char *name_chr_s, int name_chr_i); +static void *xhc_shmem_attach(opal_shmem_ds_t *seg_ds); +static mca_smsc_endpoint_t *xhc_smsc_ep(xhc_peer_info_t *peer_info); + +// ------------------------------------------------ + +int mca_coll_xhc_lazy_init(xhc_module_t *module, ompi_communicator_t *comm) { + + int comm_size = ompi_comm_size(comm); + int rank = ompi_comm_rank(comm); + + xhc_peer_info_t *peer_info = module->peer_info; + + opal_shmem_ds_t *peer_cico_ds = NULL; + xhc_data_t *data = NULL; + + xhc_coll_fns_t xhc_fns; + + int return_code = OMPI_SUCCESS; + int ret; + + errno = 0; + + // ---- + + /* XHC requires rank communication during its initialization. + * Temporarily apply the saved fallback collective modules, + * and restore XHC's after initialization is done. */ + xhc_module_install_fallback_fns(module, comm, &xhc_fns); + + // ---- + + ret = xhc_module_prepare_hierarchy(module, comm); + if(ret != OMPI_SUCCESS) { + RETURN_WITH_ERROR(return_code, ret, end); + } + + // ---- + + data = malloc(sizeof(xhc_data_t)); + peer_cico_ds = malloc(comm_size * sizeof(opal_shmem_ds_t)); + if(!data || !peer_cico_ds) { + RETURN_WITH_ERROR(return_code, OMPI_ERR_OUT_OF_RESOURCE, end); + } + + *data = (xhc_data_t) { + .comms = NULL, + .comm_count = -1, + + .pvt_coll_seq = 0 + }; + + // ---- + + if(OMPI_XHC_CICO_MAX > 0) { + opal_shmem_ds_t cico_ds; + + void *my_cico = xhc_shmem_create(&cico_ds, + OMPI_XHC_CICO_MAX, comm, "cico", 0); + if(!my_cico) { + RETURN_WITH_ERROR(return_code, OMPI_ERR_OUT_OF_RESOURCE, end); + } + + /* Manually "touch" to assert allocation in local NUMA node + * (assuming linux's default firt-touch-alloc NUMA policy) */ + memset(my_cico, 0, OMPI_XHC_CICO_MAX); + + ret = comm->c_coll->coll_allgather(&cico_ds, + sizeof(opal_shmem_ds_t), MPI_BYTE, peer_cico_ds, + sizeof(opal_shmem_ds_t), MPI_BYTE, comm, + comm->c_coll->coll_allgather_module); + if(ret != OMPI_SUCCESS) { + RETURN_WITH_ERROR(return_code, ret, end); + } + + for(int r = 0; r < comm_size; r++) { + peer_info[r].cico_ds = peer_cico_ds[r]; + } + + peer_info[rank].cico_buffer = my_cico; + } + + // ---- + + /* An XHC communicator is created for each level of the hierarchy. + * The hierachy must be in an order of most-specific to most-general. 
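+     * (eg. with the default hierarchy, "numa" precedes "socket", since NUMA
+     * node locality is more specific than CPU socket locality.)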
*/ + + ret = xhc_comms_make(comm, peer_info, &data->comms, &data->comm_count, + module->hierarchy, module->hierarchy_len); + if(ret != OMPI_SUCCESS) { + RETURN_WITH_ERROR(return_code, ret, end); + } + + for(int i = 0, c = 0; i < data->comm_count; i++) { + data->comms[i].chunk_size = module->chunks[c]; + c = opal_min(c + 1, module->chunks_len - 1); + } + + if(module->chunks_len < data->comm_count) { + opal_output_verbose(MCA_BASE_VERBOSE_WARN, + ompi_coll_base_framework.framework_output, + "coll:xhc: Warning: The chunk sizes count is shorter than the " + "hierarchy size; filling in with the last entry provided"); + } else if(module->chunks_len > data->comm_count) { + opal_output_verbose(MCA_BASE_VERBOSE_WARN, + ompi_coll_base_framework.framework_output, + "coll:xhc: Warning: The chunk size count is larger than the " + "hierarchy size; omitting last entries"); + } + + // ---- + + if(mca_coll_xhc_component.print_info) { + ret = xhc_print_info(module, comm, data); + if(ret != OMPI_SUCCESS) { + RETURN_WITH_ERROR(return_code, ret, end); + } + } + + // ---- + + module->data = data; + module->init = true; + + end: + + xhc_module_install_fns(module, comm, xhc_fns); + + free(peer_cico_ds); + + if(return_code != 0) { + opal_show_help("help-coll-xhc.txt", "xhc-init-failed", true, + return_code, errno, strerror(errno)); + + xhc_fini(module); + } + + return return_code; +} + +void mca_coll_xhc_fini(mca_coll_xhc_module_t *module) { + if(module->data) { + xhc_data_t *data = module->data; + + if(data->comm_count >= 0) { + xhc_comms_destroy(data->comms, data->comm_count); + } + + free(data->comms); + free(data); + } + + if(module->peer_info) { + for(int r = 0; r < module->comm_size; r++) { + if(module->peer_info[r].cico_buffer) { + if(r == module->rank) { + // OMPI issue #11123 + // opal_shmem_unlink(&module->peer_info[r].cico_ds); + } + + opal_shmem_segment_detach(&module->peer_info[r].cico_ds); + } + + if(module->peer_info[r].smsc_ep) { + MCA_SMSC_CALL(return_endpoint, module->peer_info[r].smsc_ep); + } + } + } +} + +// ------------------------------------------------ + +/* This method is where the hierarchy of XHC is constructed; it receives + * the hierarchy specifications (hierarchy param) and groups ranks together + * among them. The process begins with the first locality in the list. All + * ranks that share this locality (determined via the relative peer to peer + * distances) become siblings. The one amongst them with the lowest rank + * number becomes the manager/leader of the group. The members don't really + * need to keep track of the actual ranks of their siblings -- only the rank + * of the group's leader/manager, the size of the group, and their own member + * ID. The process continues with the next locality, only that now only the + * ranks that became leaders in the previous level are eligible (determined + * via comm_candidate, see inline comments). 
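+ *
+ * Purely illustrative (hypothetical) example: with hierarchy = {numa, socket},
+ * ranks 0-3 on NUMA node 0 and ranks 4-7 on NUMA node 1, all within a single
+ * socket, the numa level forms the groups {0,1,2,3} and {4,5,6,7}, led by
+ * ranks 0 and 4 respectively. At the socket level, only ranks 0 and 4 remain
+ * eligible, and they form the group {0,4}, with rank 0 as its leader.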
*/ +static int xhc_comms_make(ompi_communicator_t *ompi_comm, + xhc_peer_info_t *peer_info, xhc_comm_t **comms_dst, + int *comm_count_dst, xhc_loc_t *hierarchy, int hierarchy_len) { + + int ompi_rank = ompi_comm_rank(ompi_comm); + int ompi_size = ompi_comm_size(ompi_comm); + + xhc_comm_t *comms = NULL; + int comms_size = 0; + int comm_count = 0; + + opal_shmem_ds_t *comm_ctrl_ds; + bool *comm_candidate; + + size_t smsc_reg_size = 0; + + int return_code = OMPI_SUCCESS; + int ret; + + comms = malloc((comms_size = 5) * sizeof(xhc_comm_t)); + comm_ctrl_ds = malloc(ompi_size * sizeof(opal_shmem_ds_t)); + comm_candidate = malloc(ompi_size * sizeof(bool)); + + if(!comms || !comm_ctrl_ds || !comm_candidate) { + RETURN_WITH_ERROR(return_code, OMPI_ERR_OUT_OF_RESOURCE, end); + } + + if(mca_smsc_base_has_feature(MCA_SMSC_FEATURE_REQUIRE_REGISTRATION)) { + smsc_reg_size = mca_smsc_base_registration_data_size(); + } + + for(int h = 0; h < hierarchy_len; h++) { + xhc_comm_t *xc = &comms[comm_count]; + + if(comm_count == comms_size) { + void *tmp = realloc(comms, (comms_size *= 2) * sizeof(xhc_comm_t)); + if(!tmp) { + RETURN_WITH_ERROR(return_code, OMPI_ERR_OUT_OF_RESOURCE, end); + } + comms = tmp; + } + + *xc = (xhc_comm_t) { + .locality = hierarchy[h], + + .size = 0, + .manager_rank = -1, + + .member_info = NULL, + .reduce_queue = NULL, + + .comm_ctrl = NULL, + .member_ctrl = NULL, + + .ctrl_ds = (opal_shmem_ds_t) {0} + }; + + // ---- + + /* Only ranks that were leaders in the previous level are candidates + * for this one. Every rank advertises whether others may consider + * it for inclusion via an Allgather. */ + + bool is_candidate = (comm_count == 0 + || comms[comm_count - 1].manager_rank == ompi_rank); + + ret = ompi_comm->c_coll->coll_allgather(&is_candidate, 1, + MPI_C_BOOL, comm_candidate, 1, MPI_C_BOOL, + ompi_comm, ompi_comm->c_coll->coll_allgather_module); + if(ret != OMPI_SUCCESS) { + RETURN_WITH_ERROR(return_code, ret, comm_error); + } + + for(int r = 0; r < ompi_size; r++) { + + /* If on a non-bottom comm, only managers of the previous + * comm are "full" members. However, this procedure also has + * to take place for the bottom-most comm; even if this is the + * current rank's bottom-most comm, it may not actually be so, + * for another rank (eg. with some non-symmetric hierarchies). */ + if(comm_candidate[r] == false) { + continue; + } + + // Non-local --> not part of the comm :/ + if(!PEER_IS_LOCAL(peer_info, r, xc->locality)) { + continue; + } + + /* The member ID means slightly different things whether on the + * bottom-most comm or not. On the bottom-most comm, a rank can + * either be a "full" member or not. However, on higher-up comms, + * if a rank was not a manager on the previous comm, it will not + * a "full" member. Instead, it will be a "potential" member, in + * that it keeps information about this comm, and is ready to + * take over duties and act as a normal member for a specific + * collective (eg. dynamic leader feature, or root != manager). */ + if(r == ompi_rank || (comm_count > 0 && r == comms[comm_count - 1].manager_rank)) { + xc->member_id = xc->size; + } + + // First rank to join the comm becomes the manager + if(xc->manager_rank == -1) { + xc->manager_rank = r; + } + + xc->size++; + } + + /* If there are no local peers in regards to this locality, no + * XHC comm is created for this process on this level. 
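+         * (eg. a rank that is alone in its NUMA node, with numa as the first
+         * locality, gets no NUMA-level comm, and will instead directly
+         * participate in the next, more general, level.)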
*/ + if(xc->size <= 1) { + opal_output_verbose(MCA_BASE_VERBOSE_WARN, + ompi_coll_base_framework.framework_output, + "coll:xhc: Warning: Locality 0x%04x does not result " + "in any new groupings; skipping it", xc->locality); + + /* All ranks must participate in the "control struct sharing" + * allgather, even if useless to this rank to some of them */ + + ret = ompi_comm->c_coll->coll_allgather(&xc->ctrl_ds, + sizeof(opal_shmem_ds_t), MPI_BYTE, comm_ctrl_ds, + sizeof(opal_shmem_ds_t), MPI_BYTE, ompi_comm, + ompi_comm->c_coll->coll_allgather_module); + if(ret != OMPI_SUCCESS) { + RETURN_WITH_ERROR(return_code, ret, comm_error); + } + + xhc_comms_destroy(xc, 1); + continue; + } + + // ---- + + /* Init comm stuff */ + + xc->member_info = calloc(xc->size, sizeof(xhc_member_info_t)); + if(xc->member_info == NULL) { + RETURN_WITH_ERROR(return_code, OMPI_ERR_OUT_OF_RESOURCE, comm_error); + } + + xc->reduce_queue = OBJ_NEW(opal_list_t); + if(!xc->reduce_queue) { + RETURN_WITH_ERROR(return_code, OMPI_ERR_OUT_OF_RESOURCE, comm_error); + } + + for(int m = 0; m < xc->size - 1; m++) { + xhc_rq_item_t *item = OBJ_NEW(xhc_rq_item_t); + if(!item) { + RETURN_WITH_ERROR(return_code, + OMPI_ERR_OUT_OF_RESOURCE, comm_error); + } + + opal_list_append(xc->reduce_queue, (opal_list_item_t *) item); + } + + // ---- + + // Create shared structs + if(ompi_rank == xc->manager_rank) { + size_t ctrl_len = sizeof(xhc_comm_ctrl_t) + smsc_reg_size + + xc->size * sizeof(xhc_member_ctrl_t); + + char *ctrl_base = xhc_shmem_create(&xc->ctrl_ds, ctrl_len, + ompi_comm, "ctrl", comm_count); + if(ctrl_base == NULL) { + RETURN_WITH_ERROR(return_code, OMPI_ERROR, comm_error); + } + + /* Manually "touch" to assert allocation in local NUMA node + * (assuming linux's default firt-touch-alloc NUMA policy) */ + memset(ctrl_base, 0, ctrl_len); + + xc->comm_ctrl = (void *) ctrl_base; + xc->member_ctrl = (void *) (ctrl_base + + sizeof(xhc_comm_ctrl_t) + smsc_reg_size); + } + + /* The comm's managers share the details of the communication structs + * with their children, so that they may attach to them. Because + * there's not any MPI communicator formed that includes (only) the + * members of the XHC comm, the sharing is achieved with a single + * Allgather, instead of a Broadcast inside each XHC comm. 
*/ + + ret = ompi_comm->c_coll->coll_allgather(&xc->ctrl_ds, + sizeof(opal_shmem_ds_t), MPI_BYTE, comm_ctrl_ds, + sizeof(opal_shmem_ds_t), MPI_BYTE, ompi_comm, + ompi_comm->c_coll->coll_allgather_module); + if(ret != OMPI_SUCCESS) { + RETURN_WITH_ERROR(return_code, ret, comm_error); + } + + // Attach to manager's shared structs + if(ompi_rank != xc->manager_rank) { + xc->ctrl_ds = comm_ctrl_ds[xc->manager_rank]; + + char *ctrl_base = xhc_shmem_attach(&xc->ctrl_ds); + if(ctrl_base == NULL) { + RETURN_WITH_ERROR(return_code, OMPI_ERROR, comm_error); + } + + xc->comm_ctrl = (void *) ctrl_base; + xc->member_ctrl = (void *) (ctrl_base + + sizeof(xhc_comm_ctrl_t) + smsc_reg_size); + } + + xc->my_member_ctrl = &xc->member_ctrl[xc->member_id]; + xc->my_member_info = &xc->member_info[xc->member_id]; + + // ---- + + comm_count++; + + continue; + + comm_error: { + xhc_comms_destroy(comms, comm_count+1); + comm_count = -1; + + goto end; + } + } + + REALLOC(comms, comm_count, xhc_comm_t); + + *comms_dst = comms; + *comm_count_dst = comm_count; + + end: + + free(comm_ctrl_ds); + free(comm_candidate); + + if(return_code != OMPI_SUCCESS) { + free(comms); + } + + return return_code; +} + +static void xhc_comms_destroy(xhc_comm_t *comms, int comm_count) { + bool is_manager = true; + + for(int i = 0; i < comm_count; i++) { + xhc_comm_t *xc = &comms[i]; + + if(xc->member_id != 0) { + is_manager = false; + } + + free(xc->member_info); + + if(xc->reduce_queue) { + OPAL_LIST_RELEASE(xc->reduce_queue); + } + + if(xc->comm_ctrl) { + if(is_manager) { + // OMPI issue #11123 + // opal_shmem_unlink(&xc->ctrl_ds); + (void) is_manager; + } + + opal_shmem_segment_detach(&xc->ctrl_ds); + } + + *xc = (xhc_comm_t) {0}; + } +} + +static int xhc_print_info(xhc_module_t *module, + ompi_communicator_t *comm, xhc_data_t *data) { + + int rank = ompi_comm_rank(comm); + int ret; + + if(rank == 0) { + char *drval_str; + char *lb_rla_str; + char *un_min_str; + + switch(mca_coll_xhc_component.dynamic_reduce) { + case OMPI_XHC_DYNAMIC_REDUCE_DISABLED: + drval_str = "OFF"; break; + case OMPI_XHC_DYNAMIC_REDUCE_NON_FLOAT: + drval_str = "ON (non-float)"; break; + case OMPI_XHC_DYNAMIC_REDUCE_ALL: + drval_str = "ON (all)"; break; + default: + drval_str = "???"; + } + + switch(mca_coll_xhc_component.lb_reduce_leader_assist) { + case OMPI_XHC_LB_RLA_TOP_LEVEL: + lb_rla_str = "top level"; break; + case OMPI_XHC_LB_RLA_FIRST_CHUNK: + lb_rla_str = "first chunk"; break; + case OMPI_XHC_LB_RLA_TOP_LEVEL | OMPI_XHC_LB_RLA_FIRST_CHUNK: + lb_rla_str = "top level + first chunk"; break; + case OMPI_XHC_LB_RLA_ALL: + lb_rla_str = "all"; break; + default: + lb_rla_str = "???"; + } + + ret = opal_asprintf(&un_min_str, " (min '%zu' bytes)", + mca_coll_xhc_component.uniform_chunks_min); + if(ret < 0) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + printf("------------------------------------------------\n" + "OMPI coll/xhc @ %s, priority %d\n" + " dynamic leader '%s', dynamic reduce '%s'\n" + " reduce load-balancing leader-assist '%s'\n" + " allreduce uniform chunks '%s'%s\n" + " CICO up until %zu bytes, barrier root %d\n\n" + "------------------------------------------------\n", + comm->c_name, mca_coll_xhc_component.priority, + (mca_coll_xhc_component.dynamic_leader ? "ON" : "OFF"), + drval_str, lb_rla_str, + (mca_coll_xhc_component.uniform_chunks ? "ON" : "OFF"), + (mca_coll_xhc_component.uniform_chunks ? 
un_min_str : ""), + mca_coll_xhc_component.cico_max, + mca_coll_xhc_component.barrier_root); + + free(un_min_str); + } + + for(int i = 0; i < data->comm_count; i++) { + char *mlist = NULL; + char *tmp; + + ret = opal_asprintf(&mlist, "%d", data->comms[i].manager_rank); + if(ret < 0) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + for(int m = 1; m < data->comms[i].size; m++) { + if(m == data->comms[i].member_id) { + if(i == 0 || data->comms[i-1].manager_rank == rank) { + ret = opal_asprintf(&tmp, "%s %d", mlist, rank); + } else { + ret = opal_asprintf(&tmp, "%s _", mlist); + } + } else { + ret = opal_asprintf(&tmp, "%s x", mlist); + } + + free(mlist); + mlist = tmp; + + if(ret < 0) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + } + + printf("XHC comm loc=0x%08x chunk_size=%zu with %d members [%s]\n", + data->comms[i].locality, data->comms[i].chunk_size, + data->comms[i].size, mlist); + + free(mlist); + } + + return OMPI_SUCCESS; +} + +// ------------------------------------------------ + +static void *xhc_shmem_create(opal_shmem_ds_t *seg_ds, size_t size, + ompi_communicator_t *ompi_comm, const char *name_chr_s, int name_chr_i) { + + char *shmem_file; + int ret; + + // xhc_shmem_seg.@..:_: + + ret = opal_asprintf(&shmem_file, "%s" OPAL_PATH_SEP "xhc_shmem_seg.%u@%s.%x.%d:%d_%s:%d", + mca_coll_xhc_component.shmem_backing, geteuid(), opal_process_info.nodename, + OPAL_PROC_MY_NAME.jobid, ompi_comm_rank(MPI_COMM_WORLD), ompi_comm_get_local_cid(ompi_comm), + name_chr_s, name_chr_i); + + if(ret < 0) { + return NULL; + } + + // Not 100% sure what this does!, copied from btl/sm + opal_pmix_register_cleanup(shmem_file, false, false, false); + + ret = opal_shmem_segment_create(seg_ds, shmem_file, size); + + free(shmem_file); + + if(ret != OPAL_SUCCESS) { + opal_output_verbose(MCA_BASE_VERBOSE_ERROR, + ompi_coll_base_framework.framework_output, + "coll:xhc: Error: Could not create shared memory segment"); + + return NULL; + } + + void *addr = xhc_shmem_attach(seg_ds); + + if(addr == NULL) { + opal_shmem_unlink(seg_ds); + } + + return addr; +} + +static void *xhc_shmem_attach(opal_shmem_ds_t *seg_ds) { + void *addr = opal_shmem_segment_attach(seg_ds); + + if(addr == NULL) { + opal_output_verbose(MCA_BASE_VERBOSE_ERROR, + ompi_coll_base_framework.framework_output, + "coll:xhc: Error: Could not attach to shared memory segment"); + } + + return addr; +} + +static mca_smsc_endpoint_t *xhc_smsc_ep(xhc_peer_info_t *peer_info) { + if(!peer_info->smsc_ep) { + peer_info->smsc_ep = MCA_SMSC_CALL(get_endpoint, &peer_info->proc->super); + + if(!peer_info->smsc_ep) { + opal_output_verbose(MCA_BASE_VERBOSE_ERROR, + ompi_coll_base_framework.framework_output, + "coll:xhc: Error: Failed to initialize smsc endpoint"); + + return NULL; + } + } + + return peer_info->smsc_ep; +} + +// ------------------------------------------------ + +void *mca_coll_xhc_get_cico(xhc_peer_info_t *peer_info, int rank) { + if(OMPI_XHC_CICO_MAX == 0) { + return NULL; + } + + if(peer_info[rank].cico_buffer == NULL) { + peer_info[rank].cico_buffer = xhc_shmem_attach(&peer_info[rank].cico_ds); + } + + return peer_info[rank].cico_buffer; +} + +int mca_coll_xhc_copy_expose_region(void *base, size_t len, xhc_copy_data_t **region_data) { + if(mca_smsc_base_has_feature(MCA_SMSC_FEATURE_REQUIRE_REGISTRATION)) { + void *data = MCA_SMSC_CALL(register_region, base, len); + + if(data == NULL) { + opal_output_verbose(MCA_BASE_VERBOSE_ERROR, + ompi_coll_base_framework.framework_output, + "coll:xhc: Error: Failed to register memory region with smsc"); + + return 
-1; + } + + *region_data = data; + } + + return 0; +} + +void mca_coll_xhc_copy_region_post(void *dst, xhc_copy_data_t *region_data) { + memcpy(dst, region_data, mca_smsc_base_registration_data_size()); +} + +int mca_coll_xhc_copy_from(xhc_peer_info_t *peer_info, + void *dst, void *src, size_t size, void *access_token) { + + mca_smsc_endpoint_t *smsc_ep = xhc_smsc_ep(peer_info); + + if(smsc_ep == NULL) { + return -1; + } + + int status = MCA_SMSC_CALL(copy_from, smsc_ep, + dst, src, size, access_token); + + return (status == OPAL_SUCCESS ? 0 : -1); +} + +void mca_coll_xhc_copy_close_region(xhc_copy_data_t *region_data) { + if(mca_smsc_base_has_feature(MCA_SMSC_FEATURE_REQUIRE_REGISTRATION)) + MCA_SMSC_CALL(deregister_region, region_data); +} + +void *mca_coll_xhc_get_registration(xhc_peer_info_t *peer_info, + void *peer_vaddr, size_t size, xhc_reg_t **reg) { + + mca_smsc_endpoint_t *smsc_ep = xhc_smsc_ep(peer_info); + + if(smsc_ep == NULL) { + return NULL; + } + + /* MCA_RCACHE_FLAGS_PERSIST will cause the registration to stick around. + * Though actually, because smsc/xpmem initializes the ref count to 2, + * as a means of keeping the registration around (instead of using the + * flag), our flag here doesn't have much effect. If at some point we + * would wish to actually detach memory in some or all cases, we should + * either call the unmap method twice, or reach out to Open MPI devs and + * inquire about the ref count. */ + + void *local_ptr; + + *reg = MCA_SMSC_CALL(map_peer_region, smsc_ep, + MCA_RCACHE_FLAGS_PERSIST, peer_vaddr, size, &local_ptr); + + if(*reg == NULL) { + return NULL; + } + + return local_ptr; +} + +/* Won't actually unmap/detach, since we've set + * the "persist" flag while creating the mapping */ +void mca_coll_xhc_return_registration(xhc_reg_t *reg) { + MCA_SMSC_CALL(unmap_peer_region, reg); +} diff --git a/ompi/mca/coll/xhc/coll_xhc.h b/ompi/mca/coll/xhc/coll_xhc.h new file mode 100644 index 00000000000..0de32f03b46 --- /dev/null +++ b/ompi/mca/coll/xhc/coll_xhc.h @@ -0,0 +1,514 @@ +/* + * Copyright (c) 2021-2023 Computer Architecture and VLSI Systems (CARV) + * Laboratory, ICS Forth. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_COLL_XHC_EXPORT_H +#define MCA_COLL_XHC_EXPORT_H + +#include "ompi_config.h" + +#include +#include + +#include "mpi.h" + +#include "ompi/mca/mca.h" +#include "ompi/mca/coll/coll.h" +#include "ompi/mca/coll/base/base.h" +#include "ompi/communicator/communicator.h" +#include "ompi/datatype/ompi_datatype.h" +#include "ompi/op/op.h" + +#include "opal/mca/shmem/shmem.h" +#include "opal/mca/smsc/smsc.h" + +#include "coll_xhc_atomic.h" + +#define RETURN_WITH_ERROR(var, err, label) do {(var) = (err); goto label;} \ + while(0) + +#define OBJ_RELEASE_IF_NOT_NULL(obj) do {if((obj) != NULL) OBJ_RELEASE(obj);} while(0) + +#define REALLOC(p, s, t) do {void *_tmp = realloc(p, (s)*sizeof(t)); \ + if(_tmp) (p) = _tmp;} while(0) + +#define PEER_IS_LOCAL(peer_info, rank, loc) \ + (((peer_info)[(rank)].locality & (loc)) == (loc)) + +#define OMPI_XHC_LOC_EXT_BITS (8*(sizeof(xhc_loc_t) - sizeof(opal_hwloc_locality_t))) +#define OMPI_XHC_LOC_EXT_START (8*sizeof(opal_hwloc_locality_t)) + +// --- + +#define OMPI_XHC_ACK_WIN 0 + +// Align to CPU cache line (portable way to obtain it?) +#define OMPI_XHC_ALIGN 64 + +// Call opal_progress every this many ticks when busy-waiting +#define OMPI_XHC_OPAL_PROGRESS_CYCLE 10000 + +/* Reduction leader-member load balancing, AKA should leaders reduce data? 
+ * Normally, non-leaders reduce and leaders propagate. But there are instances + * where leaders can/should also help with the group's reduction load. + * + * OMPI_XHC_LB_RLA_TOP_LEVEL: The top level's leader performs reductions + * on the top level as if a common member + * + * OMPI_XHC_LB_RLA_FIRST_CHUNK: Leaders reduce only a single chunk, on + * each level, at the beginning of the operation + * + * (OMPI_XHC_LB_RLA_TOP_LEVEL and OMPI_XHC_LB_RLA_FIRST_CHUNK are combinable) + * + * OMPI_XHC_LB_RLM_ALL: All leaders performs reductions exactly as if + * common members + * + * Generally, we might not want leaders reducing, as that may lead to load + * imbalance, since they will also have to reduce the comm's result(s) + * on upper levels. Unless a leader is also one on all levels! (e.g. the + * top-level leader). This leader should probably be assisting in the + * reduction; otherwise, the only thing he will be doing is checking + * and updating synchronization flags. + * + * Regarding the load balancing problem, the leaders will actually not have + * anything to do until the first chunk is reduced, so they might as well be + * made to help the other members with this first chunk. Keep in mind though, + * this might increase the memory load, and cause this first chunk to take + * slightly more time to be produced. */ +#define OMPI_XHC_LB_RLA_TOP_LEVEL 0x01 +#define OMPI_XHC_LB_RLA_FIRST_CHUNK 0x02 +#define OMPI_XHC_LB_RLA_ALL 0x80 + +enum { + OMPI_XHC_DYNAMIC_REDUCE_DISABLED, + OMPI_XHC_DYNAMIC_REDUCE_NON_FLOAT, + OMPI_XHC_DYNAMIC_REDUCE_ALL +}; + +#define OMPI_XHC_CICO_MAX (mca_coll_xhc_component.cico_max) + +/* For other configuration options and default + * values check coll_xhc_component.c */ + +// --- + +BEGIN_C_DECLS + +// ---------------------------------------- + +typedef uint32_t xhc_loc_t; +typedef void xhc_reg_t; +typedef void xhc_copy_data_t; + +typedef struct mca_coll_xhc_component_t mca_coll_xhc_component_t; +typedef struct mca_coll_xhc_module_t mca_coll_xhc_module_t; +typedef struct mca_coll_xhc_module_t xhc_module_t; + +typedef struct xhc_coll_fns_t xhc_coll_fns_t; +typedef struct xhc_peer_info_t xhc_peer_info_t; + +typedef struct xhc_data_t xhc_data_t; +typedef struct xhc_comm_t xhc_comm_t; + +typedef struct xhc_comm_ctrl_t xhc_comm_ctrl_t; +typedef struct xhc_member_ctrl_t xhc_member_ctrl_t; +typedef struct xhc_member_info_t xhc_member_info_t; + +typedef struct xhc_reduce_area_t xhc_reduce_area_t; +typedef struct xhc_reduce_queue_item_t xhc_rq_item_t; + +typedef struct xhc_rank_range_t xhc_rank_range_t; +typedef struct xhc_loc_def_t xhc_loc_def_t; + +OMPI_DECLSPEC extern mca_coll_xhc_component_t mca_coll_xhc_component; +OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_coll_xhc_module_t); +OMPI_DECLSPEC OBJ_CLASS_DECLARATION(xhc_rq_item_t); +OMPI_DECLSPEC OBJ_CLASS_DECLARATION(xhc_loc_def_item_t); + +// ---------------------------------------- + +struct xhc_coll_fns_t { + mca_coll_base_module_allreduce_fn_t coll_allreduce; + mca_coll_base_module_t *coll_allreduce_module; + + mca_coll_base_module_barrier_fn_t coll_barrier; + mca_coll_base_module_t *coll_barrier_module; + + mca_coll_base_module_bcast_fn_t coll_bcast; + mca_coll_base_module_t *coll_bcast_module; + + mca_coll_base_module_reduce_fn_t coll_reduce; + mca_coll_base_module_t *coll_reduce_module; +}; + +struct mca_coll_xhc_component_t { + mca_coll_base_component_t super; + + int priority; + bool print_info; + + char *shmem_backing; + + bool dynamic_leader; + + int barrier_root; + + int dynamic_reduce; + int 
lb_reduce_leader_assist; + + bool force_reduce; + + bool uniform_chunks; + size_t uniform_chunks_min; + + size_t cico_max; + + char *hierarchy_mca; + char *chunk_size_mca; +}; + +struct mca_coll_xhc_module_t { + mca_coll_base_module_t super; + + /* pointers to functions/modules of + * previous coll components for fallback */ + xhc_coll_fns_t prev_colls; + + // copied from comm + int comm_size; + int rank; + + // list of localities to consider during grouping + char *hierarchy_string; + xhc_loc_t *hierarchy; + int hierarchy_len; + + // list of requested chunk sizes, to be applied to comms + size_t *chunks; + int chunks_len; + + // temporary (private) internal buffer, for methods like Reduce + void *rbuf; + size_t rbuf_size; + + // xhc-specific info for every other rank in the comm + xhc_peer_info_t *peer_info; + + xhc_data_t *data; + + bool init; +}; + +struct xhc_peer_info_t { + xhc_loc_t locality; + + ompi_proc_t *proc; + mca_smsc_endpoint_t *smsc_ep; + + opal_shmem_ds_t cico_ds; + void *cico_buffer; +}; + +struct xhc_data_t { + xhc_comm_t *comms; + int comm_count; + + xf_sig_t pvt_coll_seq; +}; + +struct xhc_comm_t { + xhc_loc_t locality; + size_t chunk_size; + + int size; + int manager_rank; + int member_id; + + // --- + + // Am I a leader in the current collective? + bool is_coll_leader; + + // Have handshaked with all members in the current op? (useful to leader) + bool all_joined; + + /* A reduce set defines a range/area of data to be reduced, and its + * settings. We require multiple areas, because there might be different + * circumstances: + * + * 1. Under certain load balancing policies, leaders perform reductions + * for the just one chunk, and then they don't. Thus, the worker count + * changes, and the settings have to recomputed for the next areas. + * + * 2. During the "middle" of the operation, all members continuously + * reduce data in maximum-sized pieces (according to the configured + * chunk size). But, towards the end of the operation, the remaining + * elements are less than ((workers * elem_chunk)), we have to + * recalculate `elem_chunk`, so that all workers will perform + * equal work. 
*/ + struct xhc_reduce_area_t { + int start; // where the area begins + int len; // the size of the area + int workers; // how many processes perform reductions in the area + int stride; /* how much to advance inside the area after + * each reduction, unused for non-combo areas */ + + // local process settings + int work_begin; // where to begin the first reduction from + int work_end; // up to where to reduce + int work_chunk; // how much to reduce each time + int work_leftover; /* assigned leftover elements to include as + * part of the last reduction in the area */ + } reduce_area[3]; + int n_reduce_areas; + + struct xhc_member_info_t { + xhc_reg_t *sbuf_reg, *rbuf_reg; + void *sbuf, *rbuf; + bool init; + } *member_info; + + // Queue to keep track of individual reduction progress for different peers + opal_list_t *reduce_queue; + + // --- + + xhc_comm_ctrl_t *comm_ctrl; + xhc_member_ctrl_t *member_ctrl; + + opal_shmem_ds_t ctrl_ds; + + // --- + + xhc_member_ctrl_t *my_member_ctrl; // = &member_ctrl[member_id] + xhc_member_info_t *my_member_info; // = &member_info[member_id] +}; + +struct xhc_comm_ctrl_t { + // We want leader_seq, coll_ack, coll_seq to all lie in their own cache lines + + volatile xf_sig_t leader_seq; + + volatile xf_sig_t coll_ack __attribute__((aligned(OMPI_XHC_ALIGN))); + + volatile xf_sig_t coll_seq __attribute__((aligned(OMPI_XHC_ALIGN))); + + /* - Reason *NOT* to keep below fields in the same cache line as coll_seq: + * + * While members busy-wait on leader's coll_seq, initializing the rest of + * the fields will trigger cache-coherency-related "invalidate" and then + * "read miss" messages, for each store. + * + * - Reason to *DO* keep below fields in the same cache line as coll_seq: + * + * Members load from coll_seq, and implicitly fetch the entire cache + * line, which also contains the values of the other fields, that will + * also need to be loaded soon. + * + * (not 100% sure of my description here) + * + * Bcast seemed to perform better with the second option, so I went with + * that one. The best option might also be influenced by the ranks' order + * of entering in the operation. + */ + + // "Guarded" by members' coll_seq + volatile int leader_id; + volatile int leader_rank; + volatile int cico_id; + + void* volatile data_vaddr; + volatile xf_size_t bytes_ready; + + char access_token[]; +} __attribute__((aligned(OMPI_XHC_ALIGN))); + +struct xhc_member_ctrl_t { + volatile xf_sig_t member_ack; // written by member + + // written by member, at beginning of operation + volatile xf_sig_t member_seq __attribute__((aligned(OMPI_XHC_ALIGN))); + volatile int rank; + + void* volatile sbuf_vaddr; + void* volatile rbuf_vaddr; + volatile int cico_id; + + // reduction progress counters, written by member + volatile xf_int_t reduce_ready; + volatile xf_int_t reduce_done; +} __attribute__((aligned(OMPI_XHC_ALIGN))); + +struct xhc_reduce_queue_item_t { + opal_list_item_t super; + int member; // ID of member + int count; // current reduction progress for member + int area_id; // current reduce area +}; + +// ---------------------------------------- + +struct xhc_rank_range_t { + int start_rank, end_rank; +}; + +struct xhc_loc_def_t { + opal_list_item_t super; + + opal_hwloc_locality_t named_loc; + + xhc_rank_range_t *rank_list; + int rank_list_len; + + int split; + int max_ranks; + + bool repeat; +}; + +// ---------------------------------------- + +// coll_xhc_component.c +// -------------------- + +#define xhc_component_parse_hierarchy(...) 
mca_coll_xhc_component_parse_hierarchy(__VA_ARGS__) +#define xhc_component_parse_chunk_sizes(...) mca_coll_xhc_component_parse_chunk_sizes(__VA_ARGS__) + +int mca_coll_xhc_component_init_query(bool enable_progress_threads, + bool enable_mpi_threads); + +int mca_coll_xhc_component_parse_hierarchy(const char *val_str, + opal_list_t **level_defs_dst, int *nlevel_defs_dst); +int mca_coll_xhc_component_parse_chunk_sizes(const char *val_str, + size_t **vals_dst, int *len_dst); + +// coll_xhc_module.c +// ----------------- + +#define xhc_module_install_fns(...) mca_coll_xhc_module_install_fns(__VA_ARGS__) +#define xhc_module_install_fallback_fns(...) mca_coll_xhc_module_install_fallback_fns(__VA_ARGS__) + +#define xhc_module_prepare_hierarchy(...) mca_coll_xhc_module_prepare_hierarchy(__VA_ARGS__) + +mca_coll_base_module_t *mca_coll_xhc_module_comm_query( + ompi_communicator_t *comm, int *priority); + +int mca_coll_xhc_module_enable(mca_coll_base_module_t *module, + ompi_communicator_t *comm); +int mca_coll_xhc_module_disable(mca_coll_base_module_t *module, + ompi_communicator_t *comm); + +void mca_coll_xhc_module_install_fallback_fns(xhc_module_t *module, + ompi_communicator_t *comm, xhc_coll_fns_t *prev_fns_dst); +void mca_coll_xhc_module_install_fns(xhc_module_t *module, + ompi_communicator_t *comm, xhc_coll_fns_t fns); + +int mca_coll_xhc_module_prepare_hierarchy(mca_coll_xhc_module_t *module, + ompi_communicator_t *comm); + +// coll_xhc.c +// ---------- + +#define xhc_lazy_init(...) mca_coll_xhc_lazy_init(__VA_ARGS__) +#define xhc_fini(...) mca_coll_xhc_fini(__VA_ARGS__) + +#define xhc_get_cico(...) mca_coll_xhc_get_cico(__VA_ARGS__) + +#define xhc_copy_expose_region(...) mca_coll_xhc_copy_expose_region(__VA_ARGS__) +#define xhc_copy_region_post(...) mca_coll_xhc_copy_region_post(__VA_ARGS__) +#define xhc_copy_from(...) mca_coll_xhc_copy_from(__VA_ARGS__) +#define xhc_copy_close_region(...) mca_coll_xhc_copy_close_region(__VA_ARGS__) + +#define xhc_get_registration(...) mca_coll_xhc_get_registration(__VA_ARGS__) +#define xhc_return_registration(...) 
mca_coll_xhc_return_registration(__VA_ARGS__) + +int mca_coll_xhc_lazy_init(mca_coll_xhc_module_t *module, ompi_communicator_t *comm); +void mca_coll_xhc_fini(mca_coll_xhc_module_t *module); + +void *mca_coll_xhc_get_cico(xhc_peer_info_t *peer_info, int rank); + +int mca_coll_xhc_copy_expose_region(void *base, size_t len, xhc_copy_data_t **region_data); +void mca_coll_xhc_copy_region_post(void *dst, xhc_copy_data_t *region_data); +int mca_coll_xhc_copy_from(xhc_peer_info_t *peer_info, void *dst, + void *src, size_t size, void *access_token); +void mca_coll_xhc_copy_close_region(xhc_copy_data_t *region_data); + +void *mca_coll_xhc_get_registration(xhc_peer_info_t *peer_info, + void *peer_vaddr, size_t size, xhc_reg_t **reg); +void mca_coll_xhc_return_registration(xhc_reg_t *reg); + +// Primitives (respective file) +// ---------------------------- + +int mca_coll_xhc_bcast(void *buf, int count, ompi_datatype_t *datatype, + int root, ompi_communicator_t *comm, mca_coll_base_module_t *module); + +int mca_coll_xhc_barrier(ompi_communicator_t *ompi_comm, + mca_coll_base_module_t *module); + +int mca_coll_xhc_reduce(const void *sbuf, void *rbuf, + int count, ompi_datatype_t *datatype, ompi_op_t *op, int root, + ompi_communicator_t *comm, mca_coll_base_module_t *module); + +int mca_coll_xhc_allreduce(const void *sbuf, void *rbuf, + int count, ompi_datatype_t *datatype, ompi_op_t *op, + ompi_communicator_t *comm, mca_coll_base_module_t *module); + +// Miscellaneous +// ------------- + +#define xhc_allreduce_internal(...) mca_coll_xhc_allreduce_internal(__VA_ARGS__) + +int mca_coll_xhc_allreduce_internal(const void *sbuf, void *rbuf, int count, + ompi_datatype_t *datatype, ompi_op_t *op, ompi_communicator_t *ompi_comm, + mca_coll_base_module_t *module, bool require_bcast); + +// ---------------------------------------- + +// Rollover-safe check that flag has reached/exceeded thresh, with max deviation +static inline bool CHECK_FLAG(volatile xf_sig_t *flag, + xf_sig_t thresh, xf_sig_t win) { + + // This is okay because xf_sig_t is unsigned. Take care. + // The cast's necessity is dependent on the size of xf_sig_t + return ((xf_sig_t) (*flag - thresh) <= win); +} + +static inline void WAIT_FLAG(volatile xf_sig_t *flag, + xf_sig_t thresh, xf_sig_t win) { + bool ready = false; + + do { + for(int i = 0; i < OMPI_XHC_OPAL_PROGRESS_CYCLE; i++) { + if(CHECK_FLAG(flag, thresh, win)) { + ready = true; + break; + } + + /* xf_sig_t f = *flag; + if(CHECK_FLAG(&f, thresh, win)) { + ready = true; + break; + } else if(CHECK_FLAG(&f, thresh, 1000)) + printf("Debug: Flag check with window %d failed, " + "but succeeded with window 1000. flag = %d, " + "thresh = %d\n", win, f, thresh); */ + } + + if(!ready) { + opal_progress(); + } + } while(!ready); +} + +// ---------------------------------------- + +END_C_DECLS + +#endif diff --git a/ompi/mca/coll/xhc/coll_xhc_allreduce.c b/ompi/mca/coll/xhc/coll_xhc_allreduce.c new file mode 100644 index 00000000000..d45065b9dc0 --- /dev/null +++ b/ompi/mca/coll/xhc/coll_xhc_allreduce.c @@ -0,0 +1,1121 @@ +/* + * Copyright (c) 2021-2023 Computer Architecture and VLSI Systems (CARV) + * Laboratory, ICS Forth. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" +#include "mpi.h" + +#include "ompi/constants.h" +#include "ompi/datatype/ompi_datatype.h" +#include "ompi/communicator/communicator.h" +#include "ompi/op/op.h" + +#include "opal/mca/rcache/base/base.h" +#include "opal/util/show_help.h" +#include "opal/util/minmax.h" + +#include "coll_xhc.h" + +#define MAX_REDUCE_AREAS(comm) \ + ((int)(sizeof((comm)->reduce_area)/sizeof((comm)->reduce_area[0]))) + +OBJ_CLASS_INSTANCE(xhc_rq_item_t, opal_list_item_t, NULL, NULL); + +// ----------------------------- + +/* For the reduction areas, see comments in xhc_reduce_area_t's definition. + * For the leader reduction assistance policies see the flag definitions. */ +static void init_reduce_areas(xhc_comm_t *comms, + int comm_count, int allreduce_count, size_t dtype_size) { + + bool uniform_chunks = mca_coll_xhc_component.uniform_chunks; + int lb_rla = mca_coll_xhc_component.lb_reduce_leader_assist; + + for(int i = 0; i < comm_count; i++) { + xhc_comm_t *xc = &comms[i]; + + int avail_workers[MAX_REDUCE_AREAS(xc)]; + + for(int area_id = 0; area_id < MAX_REDUCE_AREAS(xc); area_id++) { + int workers = xc->size - 1; + + if(lb_rla & OMPI_XHC_LB_RLA_TOP_LEVEL) { + if(i == comm_count - 1 && workers < xc->size) + workers++; + } + + if(lb_rla & OMPI_XHC_LB_RLA_FIRST_CHUNK) { + if(area_id == 0 && workers < xc->size) + workers++; + } + + if(lb_rla & OMPI_XHC_LB_RLA_ALL) { + workers = xc->size; + } + + avail_workers[area_id] = workers; + } + + // Min/max work that a worker may perform (one step) + int min_elems = mca_coll_xhc_component.uniform_chunks_min / dtype_size; + int max_elems = xc->chunk_size / dtype_size; + + int area_id = 0, el_idx = 0; + + while(area_id < MAX_REDUCE_AREAS(xc) && el_idx < allreduce_count) { + xhc_reduce_area_t *area = &xc->reduce_area[area_id]; + + *area = (xhc_reduce_area_t) {0}; + + int remaining = allreduce_count - el_idx; + int workers = avail_workers[area_id]; + + int elems_per_member; + int repeat = 0; + + int area_elems = opal_min(max_elems * workers, remaining); + + /* We should consider the future size of the next area. If it's + * too small in relation to the minimum chunk (min_elems), some + * workers of the next area won't perform work, leading to load + * imbalance. In this case, we elect to either shrink the current + * area so that we will be able to better balance the load in the + * next one, or if the elements that remain for the next area are + * especially few, we make this area absorb the next one. + * Specifically, we absorb it if the increase of each worker's + * load is no more than 10% of the maximum load set. */ + if(uniform_chunks && area_id < MAX_REDUCE_AREAS(xc) - 1) { + int next_workers = avail_workers[area_id+1]; + int next_remaining = allreduce_count - (el_idx + area_elems); + + if(next_remaining < next_workers * min_elems) { + if(next_remaining/workers <= max_elems/10) { + area_elems += next_remaining; + } else { + int ideal_donate = next_workers * min_elems - next_remaining; + + /* Don't donate so much elements that this area + * won't cover its own min reduction chunk size */ + int max_donate = area_elems - workers * min_elems; + max_donate = (max_donate > 0 ? max_donate : 0); + + area_elems -= opal_min(ideal_donate, max_donate); + } + } + } + + if(uniform_chunks) { + /* The elements might not be enough for every worker to do + * work. 
We calculate how many workers we need so that no + * one of them does less than min_elems work, and use the + * result to calculate the final elements per member. */ + workers = opal_min(area_elems/min_elems, workers); + workers = opal_max(workers, 1); + + elems_per_member = area_elems / workers; + } else { + elems_per_member = max_elems; + workers = area_elems/max_elems; + } + + // If this is the middle area, try to maximize its size + if(area_id == 1 && workers > 0) { + int set = workers * elems_per_member; + repeat = (int)((remaining-area_elems)/set); + area_elems += repeat * set; + } + + area->start = el_idx; + area->len = area_elems; + area->workers = workers; + area->stride = workers * elems_per_member; + + /* My ID, assuming that if some member is not reducing, it is + * the one with ID=0, because currently only member 0 becomes + * the leader, and the leader is the only one that might not + * be reducing. */ + int worker_id = xc->member_id - (xc->size - avail_workers[area_id]); + + area->work_begin = el_idx + worker_id * elems_per_member; + area->work_chunk = (worker_id >= 0 && worker_id < workers ? + elems_per_member : 0); + + area->work_leftover = 0; + + int leftover_elems = (workers > 0 ? + (area_elems % (workers * elems_per_member)) : area_elems); + if(leftover_elems) { + if(worker_id == (uniform_chunks ? workers - 1 : workers)) { + area->work_leftover = leftover_elems; + } + } + + area->work_end = area->work_begin + (repeat * area->stride) + + area->work_chunk + area->work_leftover; + + el_idx += area_elems; + area_id++; + } + + assert(el_idx == allreduce_count); + + xc->n_reduce_areas = area_id; + + // Erase zero-work areas + while(xc->n_reduce_areas > 0 + && xc->reduce_area[xc->n_reduce_areas - 1].work_chunk == 0 + && xc->reduce_area[xc->n_reduce_areas - 1].work_leftover == 0) { + xc->n_reduce_areas--; + } + + /* If not a leader on this comm, nothing + * to do on next ones whatsoever */ + if(!xc->is_coll_leader) { + break; + } + } +} + +static void xhc_allreduce_init_local(xhc_comm_t *comms, int comm_count, + int allreduce_count, size_t dtype_size, xf_sig_t seq) { + + for(int i = 0; i < comm_count; i++) { + xhc_comm_t *xc = &comms[i]; + + xc->is_coll_leader = false; + + for(int m = 0; m < xc->size; m++) { + xc->member_info[m] = (xhc_member_info_t) {0}; + } + + xc->all_joined = false; + } + + for(int i = 0; i < comm_count; i++) { + xhc_comm_t *xc = &comms[i]; + + /* The manager is the leader. Even in the dynamic reduce case, + * there (currently) shouldn't be any real benefit from the + * leader being dynamic in allreduce. */ + if(xc->member_id != 0) { + break; + } + + xc->comm_ctrl->leader_seq = seq; + xc->is_coll_leader = true; + } + + init_reduce_areas(comms, comm_count, allreduce_count, dtype_size); + + for(int i = 0; i < comm_count; i++) { + xhc_comm_t *xc = &comms[i]; + + int initial_count = (xc->n_reduce_areas > 0 ? 
+ xc->reduce_area[0].work_begin : allreduce_count); + + int m = 0; + OPAL_LIST_FOREACH_DECL(item, xc->reduce_queue, xhc_rq_item_t) { + if(m == xc->member_id) { + m++; + } + + *item = (xhc_rq_item_t) {.super = item->super, .member = m++, + .count = initial_count, .area_id = 0}; + } + + if(!xc->is_coll_leader) { + break; + } + } +} + +static void xhc_allreduce_init_comm(xhc_comm_t *comms, int comm_count, + void *rbuf, bool do_cico, int ompi_rank, xf_sig_t seq) { + + for(int i = 0; i < comm_count; i++) { + xhc_comm_t *xc = &comms[i]; + + if(!xc->is_coll_leader) { + break; + } + + WAIT_FLAG(&xc->comm_ctrl->coll_ack, seq - 1, 0); + + /* Because there is a control dependency with the load + * from coll_ack above and the code below, and because + * it is a load-store one (not load-load), I declare + * that a read-memory-barrier is not required here. */ + + xc->comm_ctrl->leader_id = xc->member_id; + xc->comm_ctrl->leader_rank = ompi_rank; + xc->comm_ctrl->data_vaddr = (!do_cico ? rbuf : NULL); + xc->comm_ctrl->bytes_ready = 0; + + xhc_atomic_wmb(); + + xc->comm_ctrl->coll_seq = seq; + } +} + +static void xhc_allreduce_init_member(xhc_comm_t *comms, int comm_count, + xhc_peer_info_t *peer_info, void *sbuf, void *rbuf, int allreduce_count, + bool do_cico, int ompi_rank, xf_sig_t seq) { + + for(int i = 0; i < comm_count; i++) { + xhc_comm_t *xc = &comms[i]; + + /* Essentially the value of reduce area-0's + * work_begin, as set in init_local() */ + int rq_first_count = ((xhc_rq_item_t *) + opal_list_get_first(xc->reduce_queue))->count; + + /* Make sure that the previous owner of my member ctrl (tip: can + * occur with dynamic leadership (or non-zero root!?), when it is + * implemented ^^) is not still using it. Also not that this + * previous owner will set member_ack only after the comm's coll_ack + * is set, so it also guarantees that no other member in the comm is + * accessing the member's flags from a previous collective. */ + WAIT_FLAG(&xc->my_member_ctrl->member_ack, seq - 1, 0); + + xc->my_member_ctrl->reduce_done = rq_first_count; + xc->my_member_ctrl->reduce_ready = (i == 0 && !do_cico ? allreduce_count : 0); + + xc->my_member_ctrl->rank = ompi_rank; + + if(!do_cico) { + xc->my_member_ctrl->sbuf_vaddr = (i == 0 ? sbuf : rbuf); + xc->my_member_ctrl->rbuf_vaddr = (xc->is_coll_leader ? rbuf : NULL); + + xc->my_member_ctrl->cico_id = -1; + + xc->my_member_info->sbuf = (i == 0 ? sbuf : rbuf); + xc->my_member_info->rbuf = rbuf; + } else { + xc->my_member_ctrl->sbuf_vaddr = NULL; + xc->my_member_ctrl->rbuf_vaddr = NULL; + + int cico_id = (i == 0 ? 
ompi_rank : comms[i-1].manager_rank); + xc->my_member_ctrl->cico_id = cico_id; + + xc->my_member_info->sbuf = xhc_get_cico(peer_info, cico_id); + xc->my_member_info->rbuf = xhc_get_cico(peer_info, ompi_rank); + } + + xhc_atomic_wmb(); + xc->my_member_ctrl->member_seq = seq; + + if(!xc->is_coll_leader) { + break; + } + } +} + +// ----------------------------- + +static int xhc_allreduce_attach_member(xhc_comm_t *xc, int member, + xhc_peer_info_t *peer_info, size_t bytes, bool do_cico, xf_sig_t seq) { + + if(xc->member_info[member].init) { + return 0; + } + + if(!do_cico) { + int member_rank = xc->member_ctrl[member].rank; + + void *sbuf_vaddr = xc->member_ctrl[member].sbuf_vaddr; + void *rbuf_vaddr = xc->member_ctrl[member].rbuf_vaddr; + + xc->member_info[member].sbuf = xhc_get_registration( + &peer_info[member_rank], sbuf_vaddr, bytes, + &xc->member_info[member].sbuf_reg); + + if(xc->member_info[member].sbuf == NULL) { + return -1; + } + + // Leaders will also share their rbuf + if(rbuf_vaddr) { + if(rbuf_vaddr != sbuf_vaddr) { + xc->member_info[member].rbuf = xhc_get_registration( + &peer_info[member_rank], rbuf_vaddr, bytes, + &xc->member_info[member].rbuf_reg); + + if(xc->member_info[member].rbuf == NULL) { + return -1; + } + } else + xc->member_info[member].rbuf = xc->member_info[member].sbuf; + } + } else { + /* Here's the deal with CICO buffers and the comm's manager: In order + * to avoid excessive amounts of attachments, ranks that are + * foreign to a comm only attach to the comm's manager's CICO buffer, + * instead of to every member's. Therefore, members will place their + * final data in the manager's CICO buffer, instead of the leader's + * (even though the leader and the manager actually very often are one + * and the same..). */ + + xc->member_info[member].sbuf = xhc_get_cico(peer_info, + xc->member_ctrl[member].cico_id); + + if(CHECK_FLAG(&xc->comm_ctrl->coll_seq, seq, 0) + && member == xc->comm_ctrl->leader_id) { + xc->member_info[member].rbuf = xhc_get_cico(peer_info, xc->manager_rank); + } + } + + xc->member_info[member].init = true; + + return 0; +} + +static void xhc_allreduce_leader_check_all_joined(xhc_comm_t *xc, xf_sig_t seq) { + for(int m = 0; m < xc->size; m++) { + if(m == xc->member_id) { + continue; + } + + if(!CHECK_FLAG(&xc->member_ctrl[m].member_seq, seq, 0)) { + return; + } + } + + xc->all_joined = true; +} + +static void xhc_allreduce_disconnect_peers(xhc_comm_t *comms, int comm_count) { + xhc_comm_t *xc = comms; + + while(xc && xc->is_coll_leader) { + xc = (xc != &comms[comm_count-1] ? 
xc + 1 : NULL); + } + + if(xc == NULL) { + return; + } + + xhc_reg_t *reg; + + for(int m = 0; m < xc->size; m++) { + if(m == xc->member_id) { + continue; + } + + if((reg = xc->member_info[m].sbuf_reg)) { + xhc_return_registration(reg); + } + + if((reg = xc->member_info[m].rbuf_reg)) { + xhc_return_registration(reg); + } + } +} + +// ----------------------------- + +static xhc_comm_t *xhc_allreduce_bcast_src_comm(xhc_comm_t *comms, int comm_count) { + xhc_comm_t *s = NULL; + + for(int i = 0; i < comm_count; i++) { + if(!comms[i].is_coll_leader) { + s = &comms[i]; + break; + } + } + + return s; +} + +static void xhc_allreduce_do_ack(xhc_comm_t *comms, int comm_count, xf_sig_t seq) { + for(int i = 0; i < comm_count; i++) { + xhc_comm_t *xc = &comms[i]; + + xc->my_member_ctrl->member_ack = seq; + + if(!xc->is_coll_leader) { + break; + } + + for(int m = 0; m < xc->size; m++) { + if(m == xc->member_id) { + continue; + } + + WAIT_FLAG(&xc->member_ctrl[m].member_ack, seq, OMPI_XHC_ACK_WIN); + } + + xc->comm_ctrl->coll_ack = seq; + } +} + +// ----------------------------- + +static void xhc_allreduce_cico_publish(xhc_comm_t *xc, void *data_src, + xhc_peer_info_t *peer_info, int ompi_rank, int allreduce_count, + size_t dtype_size) { + + int ready = xc->my_member_ctrl->reduce_ready; + + /* The chunk size here is just a means of pipelining the CICO + * publishing, for whichever case this might be necessary in. + * There isn't really any reason to consult reduce areas and + * their chunk sizes here.*/ + int elements = opal_min(xc->chunk_size/dtype_size, allreduce_count - ready); + + void *src = (char *) data_src + ready * dtype_size; + void *dst = (char *) xhc_get_cico(peer_info, ompi_rank) + ready * dtype_size; + + memcpy(dst, src, elements * dtype_size); + xhc_atomic_wmb(); + + volatile xf_int_t *rrp = &xc->my_member_ctrl->reduce_ready; + xhc_atomic_store_int(rrp, ready + elements); +} + +static int xhc_allreduce_reduce_get_next(xhc_comm_t *xc, + xhc_peer_info_t *peer_info, int allreduce_count, + size_t dtype_size, bool do_cico, bool out_of_order_reduce, + xf_sig_t seq, xhc_rq_item_t **item_dst) { + + xhc_rq_item_t *member_item = NULL; + int stalled_member = xc->size; + + /* Iterate the reduce queue, to determine which member's data to reduce, + * and from what index. The reduction queue aids in the implementation of + * the rationale that members that are not ready at some point should be + * temporarily skipped, to prevent stalling in the collective. Reasons + * that a member may not be "ready" are (1) it has not yet joined the + * collective, (2) the necessary data have not yet been produced (eg. + * because the member's children have not finished their reduction on the + * previous communicator) or have not been copied to the CICO buffer. + * However, when floating point data is concerned, skipping members and + * therefore doing certain reductions in non-deterministic order results + * to reproducibility problems. Hence the existence of the "dynamic reduce" + * switch; when enabled, members are skipped when not ready. When disabled, + * members are skipped, but only the data of members with a lower ID that + * the one that has stalled can be reduced (eg. member 2 has stalled, but + * reduction for future chunks of members 0 and 1 (only, not of member 3, + * even if it is ready) will begin instead of completely stalling). The + * reduction queue is sorted according to the reduction progress counter in + * each entry. 
This helps ensure fully reduced chunks are generated as soon + * as possible, so that leaders can quickly propagate them upwards. */ + OPAL_LIST_FOREACH_DECL(item, xc->reduce_queue, xhc_rq_item_t) { + int member = item->member; + + if(!xc->member_info[member].init + && CHECK_FLAG(&xc->member_ctrl[member].member_seq, seq, 0)) { + + xhc_atomic_rmb(); + + int ret = xhc_allreduce_attach_member(xc, member, peer_info, + allreduce_count * dtype_size, do_cico, seq); + + if(ret != 0) { + return ret; + } + } + + if(xc->member_info[member].init && item->count < allreduce_count) { + xhc_reduce_area_t *area = &xc->reduce_area[item->area_id]; + int elements = area->work_chunk; + + if(item->count + elements + area->work_leftover == area->work_end) { + elements += area->work_leftover; + } + + int self_ready = xc->my_member_ctrl->reduce_ready; + + volatile xf_int_t *rrp = &xc->member_ctrl[member].reduce_ready; + int member_ready = xhc_atomic_load_int(rrp); + + if(self_ready >= item->count + elements + && member_ready >= item->count + elements + && member < stalled_member) { + + member_item = item; + break; + } + } + + if(!out_of_order_reduce) { + stalled_member = opal_min(stalled_member, member); + } + } + + if(member_item) { + opal_list_remove_item(xc->reduce_queue, (opal_list_item_t *) member_item); + } + + *item_dst = member_item; + + return 0; +} + +static void xhc_allreduce_rq_item_analyze(xhc_comm_t *xc, xhc_rq_item_t *item, + bool *first_reduction, bool *last_reduction) { + + *first_reduction = false; + *last_reduction = false; + + if(opal_list_get_size(xc->reduce_queue) == 0) { + *first_reduction = true; + *last_reduction = true; + } else { + xhc_rq_item_t *first_item = (xhc_rq_item_t *) + opal_list_get_first(xc->reduce_queue); + + xhc_rq_item_t *last_item = (xhc_rq_item_t *) + opal_list_get_last(xc->reduce_queue); + + /* If this count is equal or larger than the last one, it means that + * no other count in the queue is larger than it. Therefore, this is the + * first reduction taking place for the "member_item->count" chunk idx. */ + if(item->count >= last_item->count) { + *first_reduction = true; + } + + /* If this count is uniquely minimum in the queue, this is the + * last reduction taking place for this specific chunk index. */ + if(item->count < first_item->count) { + *last_reduction = true; + } + } +} + +static void xhc_allreduce_do_reduce(xhc_comm_t *xc, xhc_rq_item_t *member_item, + int allreduce_count, ompi_datatype_t *dtype, size_t dtype_size, + ompi_op_t *op) { + + xhc_reduce_area_t *area = &xc->reduce_area[member_item->area_id]; + int elements = area->work_chunk; + + if(member_item->count + elements + area->work_leftover == area->work_end) { + elements += area->work_leftover; + } + + size_t offset = member_item->count * dtype_size; + + char *src = (char *) xc->member_info[member_item->member].sbuf + offset; + + char *dst; + char *src2 = NULL; + + bool first_reduction, last_reduction; + + xhc_allreduce_rq_item_analyze(xc, member_item, + &first_reduction, &last_reduction); + + /* Only access comm_ctrl when it's the last reduction. 
Otherwise, + * it's not guaranteed that the leader will have initialized it yet.*/ + if(last_reduction) { + dst = (char *) xc->member_info[xc->comm_ctrl->leader_id].rbuf + offset; + } else { + dst = (char *) xc->my_member_info->rbuf + offset; + } + + if(first_reduction) { + src2 = (char *) xc->my_member_info->sbuf + offset; + } else if(last_reduction) { + src2 = (char *) xc->my_member_info->rbuf + offset; + } + + // Happens under certain circumstances with MPI_IN_PLACE or with CICO + if(src2 == dst) { + src2 = NULL; + } else if(src == dst) { + src = src2; + src2 = NULL; + } + + xhc_atomic_rmb(); + + if(src2) { + ompi_3buff_op_reduce(op, src2, src, dst, elements, dtype); + } else { + ompi_op_reduce(op, src, dst, elements, dtype); + } + + /* If we reached the end of the area after this reduction, switch + * to the next one, or mark completion if it was the last one. + * Otherwise, adjust the count according to the area's parameters. */ + if(member_item->count + elements == area->work_end) { + if(member_item->area_id < xc->n_reduce_areas - 1) { + member_item->area_id++; + member_item->count = xc->reduce_area[member_item->area_id].work_begin; + } else { + member_item->count = allreduce_count; + } + } else { + member_item->count += area->stride; + } +} + +static void xhc_allreduce_reduce_return_item(xhc_comm_t *xc, + xhc_rq_item_t *member_item) { + + bool placed = false; + + xhc_rq_item_t *item; + OPAL_LIST_FOREACH_REV(item, xc->reduce_queue, xhc_rq_item_t) { + if(member_item->count >= item->count) { + opal_list_insert_pos(xc->reduce_queue, + (opal_list_item_t *) item->super.opal_list_next, + (opal_list_item_t *) member_item); + + placed = true; + break; + } + } + + if(!placed) { + opal_list_prepend(xc->reduce_queue, (opal_list_item_t *) member_item); + } + + xhc_rq_item_t *first_item = (xhc_rq_item_t *) + opal_list_get_first(xc->reduce_queue); + + if(first_item->count > xc->my_member_ctrl->reduce_done) { + xhc_atomic_wmb(); + + volatile xf_int_t *rdp = &xc->my_member_ctrl->reduce_done; + xhc_atomic_store_int(rdp, first_item->count); + } +} + +static void xhc_allreduce_do_bcast(xhc_comm_t *comms, int comm_count, + xhc_comm_t *src_comm, size_t bytes_total, size_t *bcast_done, + const void *bcast_src, void *bcast_dst, void *bcast_cico) { + + size_t copy_size = opal_min(src_comm->chunk_size, bytes_total - *bcast_done); + + volatile xf_size_t *brp = &src_comm->comm_ctrl->bytes_ready; + + if(xhc_atomic_load_size_t(brp) - *bcast_done >= copy_size) { + void *src = (char *) bcast_src + *bcast_done; + void *dst = (char *) bcast_dst + *bcast_done; + void *cico_dst = (char *) bcast_cico + *bcast_done; + + xhc_atomic_rmb(); + + if(bcast_cico && comms[0].is_coll_leader) { + memcpy(cico_dst, src, copy_size); + } else { + memcpy(dst, src, copy_size); + } + + *bcast_done += copy_size; + + xhc_atomic_wmb(); + + for(int i = 0; i < comm_count; i++) { + if(!comms[i].is_coll_leader) { + break; + } + + volatile xf_size_t *brp_d = &comms[i].comm_ctrl->bytes_ready; + xhc_atomic_store_size_t(brp_d, *bcast_done); + } + + if(bcast_cico && comms[0].is_coll_leader) { + memcpy(dst, cico_dst, copy_size); + } + } +} + +// ----------------------------- + +int mca_coll_xhc_allreduce_internal(const void *sbuf, void *rbuf, int count, + ompi_datatype_t *datatype, ompi_op_t *op, ompi_communicator_t *ompi_comm, + mca_coll_base_module_t *ompi_module, bool require_bcast) { + + xhc_module_t *module = (xhc_module_t *) ompi_module; + + if(!module->init) { + int ret = xhc_lazy_init(module, ompi_comm); + if(ret != OMPI_SUCCESS) { + return 
ret; + } + } + + if(!ompi_datatype_is_predefined(datatype)) { + static bool warn_shown = false; + + if(!warn_shown) { + opal_output_verbose(MCA_BASE_VERBOSE_WARN, + ompi_coll_base_framework.framework_output, + "coll:xhc: Warning: XHC does not currently support " + "derived datatypes; utilizing fallback component"); + warn_shown = true; + } + + xhc_coll_fns_t fallback = module->prev_colls; + + if(require_bcast) { + return fallback.coll_allreduce(sbuf, rbuf, count, datatype, + op, ompi_comm, fallback.coll_allreduce_module); + } else { + return fallback.coll_reduce(sbuf, rbuf, count, datatype, + op, 0, ompi_comm, fallback.coll_reduce_module); + } + } + + if(!ompi_op_is_commute(op)) { + static bool warn_shown = false; + + if(!warn_shown) { + opal_output_verbose(MCA_BASE_VERBOSE_WARN, + ompi_coll_base_framework.framework_output, + "coll:xhc: Warning: (all)reduce does not support non-commutative " + "operators; utilizing fallback component"); + warn_shown = true; + } + + xhc_coll_fns_t fallback = module->prev_colls; + + if(require_bcast) { + return fallback.coll_allreduce(sbuf, rbuf, count, datatype, + op, ompi_comm, fallback.coll_allreduce_module); + } else { + return fallback.coll_reduce(sbuf, rbuf, count, datatype, + op, 0, ompi_comm, fallback.coll_reduce_module); + } + } + + // ---- + + xhc_peer_info_t *peer_info = module->peer_info; + xhc_data_t *data = module->data; + + xhc_comm_t *comms = data->comms; + int comm_count = data->comm_count; + + size_t dtype_size, bytes_total; + ompi_datatype_type_size(datatype, &dtype_size); + bytes_total = count * dtype_size; + + bool do_cico = (bytes_total <= OMPI_XHC_CICO_MAX); + bool out_of_order_reduce = false; + + int rank = ompi_comm_rank(ompi_comm); + + // ---- + + switch(mca_coll_xhc_component.dynamic_reduce) { + case OMPI_XHC_DYNAMIC_REDUCE_DISABLED: + out_of_order_reduce = false; + break; + + case OMPI_XHC_DYNAMIC_REDUCE_NON_FLOAT: + out_of_order_reduce = !(datatype->super.flags & OMPI_DATATYPE_FLAG_DATA_FLOAT); + break; + + case OMPI_XHC_DYNAMIC_REDUCE_ALL: + out_of_order_reduce = true; + break; + } + + // ---- + + // rbuf won't be present for non-root ranks in MPI_Reduce + if(rbuf == NULL && !do_cico) { + if(module->rbuf_size < bytes_total) { + void *tmp = realloc(module->rbuf, bytes_total); + + if(tmp != NULL) { + module->rbuf = tmp; + module->rbuf_size = bytes_total; + } else { + return OPAL_ERR_OUT_OF_RESOURCE; + } + } + + rbuf = module->rbuf; + } + + // ---- + + xf_sig_t pvt_seq = ++data->pvt_coll_seq; + + if(sbuf == MPI_IN_PLACE) { + sbuf = rbuf; + } + + xhc_allreduce_init_local(comms, comm_count, count, dtype_size, pvt_seq); + xhc_allreduce_init_comm(comms, comm_count, rbuf, do_cico, rank, pvt_seq); + xhc_allreduce_init_member(comms, comm_count, peer_info, + (void *) sbuf, rbuf, count, do_cico, rank, pvt_seq); + + void *local_cico = xhc_get_cico(peer_info, comms[0].manager_rank); + + // My conscience is clear! + if(require_bcast) { + goto _allreduce; + } else { + goto _reduce; + } + +// ============================================================================= + +_allreduce: { + + xhc_comm_t *bcast_comm = + xhc_allreduce_bcast_src_comm(comms, comm_count); + + bool bcast_leader_joined = false; + + for(size_t bytes_done = 0; bytes_done < bytes_total; ) { + for(int i = 0; i < comm_count; i++) { + xhc_comm_t *xc = &comms[i]; + xhc_comm_t *xnc = (i < comm_count - 1 ? 
&comms[i+1] : NULL); + + if(do_cico && i == 0 && xc->my_member_ctrl->reduce_ready < count) { + xhc_allreduce_cico_publish(xc, (void *) sbuf, + peer_info, rank, count, dtype_size); + } + + if(xc->is_coll_leader) { + int completed = 0; + + if(!xc->all_joined) { + xhc_allreduce_leader_check_all_joined(xc, pvt_seq); + } + + if(xc->all_joined) { + completed = count; + + for(int m = 0; m < xc->size; m++) { + volatile xf_int_t *rdp = &xc->member_ctrl[m].reduce_done; + int member_done = xhc_atomic_load_int(rdp); + + /* Watch out for double evaluation here, don't perform + * sensitive loads inside opal_min()'s parameter list. */ + completed = opal_min(completed, member_done); + } + } + + if(xnc && completed > xnc->my_member_ctrl->reduce_ready) { + volatile xf_int_t *rrp = &xnc->my_member_ctrl->reduce_ready; + xhc_atomic_store_int(rrp, completed); + } else if(!xnc) { + size_t bytes_fully_reduced = completed * dtype_size; + + // Broadcast fully reduced data + if(bytes_fully_reduced > bytes_done) { + for(int k = 0; k < comm_count; k++) { + volatile xf_size_t *brp = + &comms[k].comm_ctrl->bytes_ready; + xhc_atomic_store_size_t(brp, bytes_fully_reduced); + } + + if(do_cico) { + void *src = (char *) local_cico + bytes_done; + void *dst = (char *) rbuf + bytes_done; + memcpy(dst, src, bytes_fully_reduced - bytes_done); + } + + bytes_done = bytes_fully_reduced; + } + } + } + + // Is the reduction phase completed? + if(xc->my_member_ctrl->reduce_done < count) { + xhc_rq_item_t *member_item = NULL; + + int ret = xhc_allreduce_reduce_get_next(xc, + peer_info, count, dtype_size, do_cico, + out_of_order_reduce, pvt_seq, &member_item); + + if(ret != 0) { + return OMPI_ERROR; + } + + if(member_item) { + xhc_allreduce_do_reduce(xc, member_item, + count, datatype, dtype_size, op); + + xhc_allreduce_reduce_return_item(xc, member_item); + } + } + + /* If not a leader in this comm, not + * participating in higher-up ones. */ + if(!xc->is_coll_leader) { + break; + } + } + + if(bcast_comm && !bcast_leader_joined) { + if(CHECK_FLAG(&bcast_comm->comm_ctrl->coll_seq, pvt_seq, 0)) { + xhc_atomic_rmb(); + + int leader = bcast_comm->comm_ctrl->leader_id; + + if(!bcast_comm->member_info[leader].init) { + WAIT_FLAG(&bcast_comm->member_ctrl[leader].member_seq, + pvt_seq, 0); + + xhc_atomic_rmb(); + + xhc_allreduce_attach_member(bcast_comm, leader, + peer_info, bytes_total, do_cico, pvt_seq); + } + + bcast_leader_joined = true; + } + } + + if(bcast_comm && bcast_leader_joined) { + int leader = bcast_comm->comm_ctrl->leader_id; + + xhc_allreduce_do_bcast(comms, comm_count, + bcast_comm, bytes_total, &bytes_done, + bcast_comm->member_info[leader].rbuf, + rbuf, (do_cico ? local_cico : NULL)); + } + } + + xhc_allreduce_do_ack(comms, comm_count, pvt_seq); + + goto _finish; +} + +// ============================================================================= + +_reduce: { + + size_t cico_copied = 0; + int completed_comms = 0; + + while(completed_comms < comm_count) { + for(int i = completed_comms; i < comm_count; i++) { + xhc_comm_t *xc = &comms[i]; + xhc_comm_t *xnc = (i < comm_count - 1 ? 
&comms[i+1] : NULL); + + if(do_cico && i == 0 && xc->my_member_ctrl->reduce_ready < count) { + xhc_allreduce_cico_publish(xc, (void *) sbuf, + peer_info, rank, count, dtype_size); + } + + if(xc->is_coll_leader) { + int completed = 0; + + if(!xc->all_joined) { + xhc_allreduce_leader_check_all_joined(xc, pvt_seq); + } + + if(xc->all_joined) { + completed = count; + + for(int m = 0; m < xc->size; m++) { + volatile xf_int_t *rdp = &xc->member_ctrl[m].reduce_done; + int member_done = xhc_atomic_load_int(rdp); + + /* Watch out for double evaluation here, don't perform + * sensitive loads inside opal_min()'s parameter list. */ + completed = opal_min(completed, member_done); + } + } + + if(xnc && completed > xnc->my_member_ctrl->reduce_ready) { + volatile xf_int_t *rrp = &xnc->my_member_ctrl->reduce_ready; + xhc_atomic_store_int(rrp, completed); + } else if(!xnc) { + size_t completed_bytes = completed * dtype_size; + + if(do_cico && completed_bytes > cico_copied) { + void *src = (char *) local_cico + cico_copied; + void *dst = (char *) rbuf + cico_copied; + + memcpy(dst, src, completed_bytes - cico_copied); + cico_copied = completed_bytes; + } + } + + if(completed >= count) { + xc->comm_ctrl->coll_ack = pvt_seq; + completed_comms++; + } + } + + // Is the reduction phase completed? + if(xc->my_member_ctrl->reduce_done < count) { + xhc_rq_item_t *member_item = NULL; + + int ret = xhc_allreduce_reduce_get_next(xc, + peer_info, count, dtype_size, do_cico, + out_of_order_reduce, pvt_seq, &member_item); + + if(ret != 0) { + return OMPI_ERROR; + } + + if(member_item) { + xhc_allreduce_do_reduce(xc, member_item, + count, datatype, dtype_size, op); + + xhc_allreduce_reduce_return_item(xc, member_item); + } + } + + if(!xc->is_coll_leader) { + /* If all reduction-related tasks are done, and + * not a leader on the next comm, can exit */ + if(xc->my_member_ctrl->reduce_done >= count + && xc->my_member_ctrl->reduce_ready >= count) { + goto _reduce_done; + } + + /* Not a leader in this comm, so not + * participating in higher-up ones. */ + break; + } + } + } + + _reduce_done: + + for(int i = 0; i < comm_count; i++) { + xhc_comm_t *xc = &comms[i]; + + /* Wait for the leader to give the signal that reduction + * has finished on this comm and members are free to exit */ + if(!xc->is_coll_leader) { + WAIT_FLAG(&xc->comm_ctrl->coll_ack, pvt_seq, OMPI_XHC_ACK_WIN); + } + + // load-store control dependency with coll_ack; no need for barrier + xc->my_member_ctrl->member_ack = pvt_seq; + + if(!xc->is_coll_leader) { + break; + } + } + + goto _finish; +} + +// ============================================================================= + +_finish: + + if(!do_cico) { + xhc_allreduce_disconnect_peers(comms, comm_count); + } + + return OMPI_SUCCESS; +} + +int mca_coll_xhc_allreduce(const void *sbuf, void *rbuf, + int count, ompi_datatype_t *datatype, ompi_op_t *op, + ompi_communicator_t *ompi_comm, mca_coll_base_module_t *ompi_module) { + + return xhc_allreduce_internal(sbuf, rbuf, + count, datatype, op, ompi_comm, ompi_module, true); +} diff --git a/ompi/mca/coll/xhc/coll_xhc_atomic.h b/ompi/mca/coll/xhc/coll_xhc_atomic.h new file mode 100644 index 00000000000..79f1dce98cb --- /dev/null +++ b/ompi/mca/coll/xhc/coll_xhc_atomic.h @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2021-2023 Computer Architecture and VLSI Systems (CARV) + * Laboratory, ICS Forth. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_COLL_XHC_ATOMIC_EXPORT_H +#define MCA_COLL_XHC_ATOMIC_EXPORT_H + +#include +#include "opal/sys/atomic.h" + +// ---------------------------------------- + +#define IS_SIG_ATOMIC_X_BITS(x) \ + (SIG_ATOMIC_MAX == INT ## x ## _MAX) || (SIG_ATOMIC_MAX == UINT ## x ## _MAX) + +// ---------------------------------------- + +// If xf_sig_t is ever re-defined to be signed, + // CHECK_FLAGS()'s comparisons must be adjusted +#if IS_SIG_ATOMIC_X_BITS(64) + typedef uint64_t xf_sig_t; +#elif IS_SIG_ATOMIC_X_BITS(32) + typedef uint32_t xf_sig_t; +#elif IS_SIG_ATOMIC_X_BITS(16) + typedef uint16_t xf_sig_t; +#elif IS_SIG_ATOMIC_X_BITS(8) + typedef uint8_t xf_sig_t; +#endif + +typedef int __attribute__((aligned(SIZEOF_INT))) xf_int_t; +typedef size_t __attribute__((aligned(SIZEOF_SIZE_T))) xf_size_t; + +// ---------------------------------------- + +#define xhc_atomic_rmb opal_atomic_rmb +#define xhc_atomic_wmb opal_atomic_wmb +#define xhc_atomic_fmb opal_atomic_mb + +// https://github.com/open-mpi/ompi/issues/9722 + +#if OPAL_USE_GCC_BUILTIN_ATOMICS || OPAL_USE_C11_ATOMICS + #define xhc_atomic_load_int(addr) __atomic_load_n(addr, __ATOMIC_RELAXED) + #define xhc_atomic_store_int(addr, val) __atomic_store_n(addr, val, __ATOMIC_RELAXED) + + #define xhc_atomic_load_size_t(addr) __atomic_load_n(addr, __ATOMIC_RELAXED) + #define xhc_atomic_store_size_t(addr, val) __atomic_store_n(addr, val, __ATOMIC_RELAXED) +#else + #define xhc_atomic_load_int(addr) (*(addr)) + #define xhc_atomic_store_int(addr, val) (*(addr) = (val)) + + #define xhc_atomic_load_size_t(addr) (*(addr)) + #define xhc_atomic_store_size_t(addr, val) (*(addr) = (val)) + + #warning "GCC or the C11 atomics backend was not found. 
XHC might not function correctly" +/* #else + #error "XHC atomics do not yet work without the GCC or the C11 backend" */ +#endif + + +// If/when opal atomic load/store size_t is added + +/* #define xhc_atomic_load_size_t(addr) \ + opal_atomic_load_size_t ((opal_atomic_size_t *) addr) +#define xhc_atomic_store_size_t(addr, val) \ + opal_atomic_store_size_t ((opal_atomic_size_t *) addr, val) */ + + +// If/when opal atomic load/store is added, and if opal atomic load/store int is not + +/* #if SIZEOF_INT == 4 + #define xhc_atomic_load_int(addr) opal_atomic_load_32 ((opal_atomic_int32_t *) addr) + #define xhc_atomic_store_int(addr, val) opal_atomic_store_32 ((opal_atomic_int32_t *) addr, val) +#elif SIZEOF_INT == 8 + #define xhc_atomic_load_int(addr) opal_atomic_load_64 ((opal_atomic_int64_t *) addr) + #define xhc_atomic_store_int(addr, val) opal_atomic_store_64 ((opal_atomic_int64_t *) addr, val) +#else + #error "Unsupported int size" +#endif */ + + +// If/when opal atomic load/store is added, and if opal atomic load/store size_t is not + +/* #if SIZEOF_SIZE_T == 4 + #define xhc_atomic_load_size_t(addr) opal_atomic_load_32 ((opal_atomic_int32_t *) addr) + #define xhc_atomic_store_size_t(addr, val) opal_atomic_store_32 ((opal_atomic_int32_t *) addr, val) +#elif SIZEOF_SIZE_T == 8 + #define xhc_atomic_load_size_t(addr) opal_atomic_load_64 ((opal_atomic_int64_t *) addr) + #define xhc_atomic_store_size_t(addr, val) opal_atomic_store_64 ((opal_atomic_int64_t *) addr, val) +#else + #error "Unsupported size_t size" +#endif */ + +static inline bool xhc_atomic_cmpxchg_strong_relaxed(volatile xf_sig_t *addr, + xf_sig_t *oldval, xf_sig_t newval) { + + #if OPAL_USE_GCC_BUILTIN_ATOMICS || OPAL_USE_C11_ATOMICS + return __atomic_compare_exchange_n(addr, oldval, newval, + false, __ATOMIC_RELAXED, __ATOMIC_RELAXED); + #else + #if IS_SIG_ATOMIC_X_BITS(32) + return opal_atomic_compare_exchange_strong_32(addr, oldval, newval); + #elif IS_SIG_ATOMIC_X_BITS(64) + return opal_atomic_compare_exchange_strong_64(addr, oldval, newval); + #else + #error "Unsupported sig_atomic_t size" + #endif + #endif +} + +#endif diff --git a/ompi/mca/coll/xhc/coll_xhc_barrier.c b/ompi/mca/coll/xhc/coll_xhc_barrier.c new file mode 100644 index 00000000000..ade1300134a --- /dev/null +++ b/ompi/mca/coll/xhc/coll_xhc_barrier.c @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2021-2023 Computer Architecture and VLSI Systems (CARV) + * Laboratory, ICS Forth. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" +#include "mpi.h" + +#include "ompi/constants.h" +#include "ompi/communicator/communicator.h" + +#include "coll_xhc.h" + +static void xhc_barrier_leader(xhc_comm_t *comms, int comm_count, + xhc_peer_info_t *peer_info, int rank, int root, xf_sig_t seq) { + + // Non-leader by default + for(int i = 0; i < comm_count; i++) { + comms[i].is_coll_leader = false; + } + + for(int i = 0; i < comm_count; i++) { + // I'm the root and therefore always a leader + if(rank == root) { + comms[i].comm_ctrl->leader_seq = seq; + comms[i].is_coll_leader = true; + + continue; + } + + // The root takes leadership precedence when local + if(PEER_IS_LOCAL(peer_info, root, comms[i].locality)) { + break; + } + + // The member with the lowest ID (ie. 
the manager) becomes the leader + if(comms[i].member_id == 0) { + comms[i].comm_ctrl->leader_seq = seq; + comms[i].is_coll_leader = true; + } + + // Non-leaders exit; they can't become leaders on higher levels + if(comms[i].is_coll_leader == false) { + break; + } + } +} + +/* Hierarchical Barrier with seq/ack flags + * --------------------------------------- + * 1. Ranks write their coll_seq field to signal they have joined + * the collective. Leaders propagate this information towards + * the top-most comm's leader using the same method. + * + * 2. The top-most comm's leader (root) sets the comm's coll_ack + * field to signal, that all ranks have joined the barrier. + * + * 3. Leaders propagate the info towards the bottom-most comm, using + * the same method. Ranks wait on thei coll_ack flag, set their + * own ack, and exit the collective. + * --------------------------------------- */ +int mca_coll_xhc_barrier(ompi_communicator_t *ompi_comm, + mca_coll_base_module_t *ompi_module) { + + xhc_module_t *module = (xhc_module_t *) ompi_module; + + if(!module->init) { + int ret = xhc_lazy_init(module, ompi_comm); + if(ret != OMPI_SUCCESS) return ret; + } + + xhc_peer_info_t *peer_info = module->peer_info; + xhc_data_t *data = module->data; + + xhc_comm_t *comms = data->comms; + int comm_count = data->comm_count; + + int rank = ompi_comm_rank(ompi_comm); + + xf_sig_t pvt_seq = ++data->pvt_coll_seq; + + xhc_barrier_leader(comms, comm_count, peer_info, rank, + mca_coll_xhc_component.barrier_root, pvt_seq); + + // 1. Upwards SEQ Wave + for(int i = 0; i < comm_count; i++) { + xhc_comm_t *xc = &comms[i]; + + xc->my_member_ctrl->member_seq = pvt_seq; + + if(!xc->is_coll_leader) { + break; + } + + for(int m = 0; m < xc->size; m++) { + if(m == xc->member_id) { + continue; + } + + /* Poll comm members and wait for them to join the barrier. + * No need for windowed comparison here; Ranks won't exit the + * barrier before the leader has set the coll_seq flag. */ + WAIT_FLAG(&xc->member_ctrl[m].member_seq, pvt_seq, 0); + } + } + + // 2. Wait for ACK (root won't wait!) + for(int i = 0; i < comm_count; i++) { + xhc_comm_t *xc = &comms[i]; + + if(xc->is_coll_leader == false) { + WAIT_FLAG(&xc->comm_ctrl->coll_ack, pvt_seq, 0); + break; + } + } + + // 3. Trigger ACK Wave + for(int i = 0; i < comm_count; i++) { + xhc_comm_t *xc = &comms[i]; + + /* Not actually necessary for the barrier operation, but + * good for consistency between all seq/ack numbers */ + xc->my_member_ctrl->member_ack = pvt_seq; + + if(!xc->is_coll_leader) { + break; + } + + xc->comm_ctrl->coll_ack = pvt_seq; + } + + return OMPI_SUCCESS; +} diff --git a/ompi/mca/coll/xhc/coll_xhc_bcast.c b/ompi/mca/coll/xhc/coll_xhc_bcast.c new file mode 100644 index 00000000000..f0b99983e50 --- /dev/null +++ b/ompi/mca/coll/xhc/coll_xhc_bcast.c @@ -0,0 +1,341 @@ +/* + * Copyright (c) 2021-2023 Computer Architecture and VLSI Systems (CARV) + * Laboratory, ICS Forth. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" +#include "mpi.h" + +#include "ompi/constants.h" +#include "ompi/datatype/ompi_datatype.h" +#include "ompi/communicator/communicator.h" +#include "opal/util/show_help.h" +#include "opal/util/minmax.h" + +#include "coll_xhc.h" + +/* When dynamic leadership is enabled, the first rank of each + * xhc comm to join the collective will become its leader */ +static void xhc_bcast_try_leader(xhc_comm_t *comms, int comm_count, + xhc_peer_info_t *peer_info, int rank, int root, xf_sig_t seq) { + + // Non-leader by default + for(int i = 0; i < comm_count; i++) { + comms[i].is_coll_leader = false; + } + + for(int i = 0; i < comm_count; i++) { + // I'm the root and therefore always a leader + if(rank == root) { + comms[i].comm_ctrl->leader_seq = seq; + comms[i].is_coll_leader = true; + + continue; + } + + // The root takes leadership precedence when local + if(PEER_IS_LOCAL(peer_info, root, comms[i].locality)) { + break; + } + + if(mca_coll_xhc_component.dynamic_leader == false) { + /* If dynamic leadership is disabled, the member with + * the lowest ID (ie. the manager) becomes the leader */ + if(comms[i].member_id == 0) { + comms[i].comm_ctrl->leader_seq = seq; + comms[i].is_coll_leader = true; + } + } else { + // An opportunity exists to become the leader + if(comms[i].comm_ctrl->leader_seq != seq) { + xf_sig_t oldval = seq - 1; + + comms[i].is_coll_leader = xhc_atomic_cmpxchg_strong_relaxed( + &comms[i].comm_ctrl->leader_seq, &oldval, seq); + } + } + + // Non-leaders exit; they can't become leaders on higher levels + if(comms[i].is_coll_leader == false) { + break; + } + } + + /* The writes and the cmpxchg to comm_ctrl->leader_seq, are relaxed. + * They do not synchronize access to any other data, and it's not a + * problem if some closeby loads/stores are reordered with it. The + * only purpose of leader_seq is to determine if a rank will be leader + * or not. Only the result of the cmp operation is utilized. */ +} + +static void xhc_bcast_children_init(xhc_comm_t *comms, int comm_count, + void *buffer, size_t bytes_ready, xhc_copy_data_t *region_data, + bool do_cico, int rank, xf_sig_t seq) { + + for(int i = comm_count - 1; i >= 0; i--) { + xhc_comm_t *xc = &comms[i]; + + if(!xc->is_coll_leader) { + continue; + } + + WAIT_FLAG(&xc->comm_ctrl->coll_ack, seq - 1, 0); + + /* Because there is a control dependency with the loads + * from coll_ack above and the code below, and because it + * is a load-store one (not load-load), I declare that a + * read-memory-barrier is not required here. */ + + xc->comm_ctrl->leader_id = xc->member_id; + xc->comm_ctrl->leader_rank = rank; + + xc->comm_ctrl->cico_id = (do_cico ? comms[0].manager_rank : -1); + + xc->comm_ctrl->data_vaddr = (!do_cico ? buffer : NULL); + xc->comm_ctrl->bytes_ready = bytes_ready; + + if(region_data != NULL) { + xhc_copy_region_post(xc->comm_ctrl->access_token, region_data); + } + + /* The above comm_ctrl stores must have finished before the + * peers are notified to attach/copy. We don't need an atomic + * store to bytes_ready here, since it is guarded by coll_seq. 
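+         * Peers wait on coll_seq (and issue a read barrier) before they
+         * start polling bytes_ready, so the plain store is safe here.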
*/ + xhc_atomic_wmb(); + + xc->comm_ctrl->coll_seq = seq; + } +} + +static void xhc_bcast_children_set_bytes_ready(xhc_comm_t *comms, + int comm_count, size_t bytes) { + + for(int i = comm_count - 1; i >= 0; i--) { + xhc_comm_t *xc = &comms[i]; + + if(!xc->is_coll_leader) { + continue; + } + + volatile xf_size_t *brp = &xc->comm_ctrl->bytes_ready; + xhc_atomic_store_size_t(brp, bytes); + } + + /* Not much reason for a wmb() here or inside the loop. + * The stores may be reordered after any following stores, + * and within themselves. */ +} + +static void xhc_bcast_do_ack(xhc_comm_t *comms, + int comm_count, xf_sig_t seq) { + + // Set Ack(s) + for(int i = 0; i < comm_count; i++) { + xhc_comm_t *xc = &comms[i]; + + xc->my_member_ctrl->member_ack = seq; + + if(!xc->is_coll_leader) { + break; + } + } + + // Gather members' Ack(s) and set coll_ack + for(int i = 0; i < comm_count; i++) { + xhc_comm_t *xc = &comms[i]; + + if(!xc->is_coll_leader) { + break; + } + + for(int m = 0; m < xc->size; m++) { + if(m == xc->member_id) { + continue; + } + + WAIT_FLAG(&xc->member_ctrl[m].member_ack, seq, OMPI_XHC_ACK_WIN); + } + + xc->comm_ctrl->coll_ack = seq; + } +} + +static xhc_comm_t *xhc_bcast_src_comm(xhc_comm_t *comms, int comm_count) { + xhc_comm_t *s = NULL; + + for(int i = 0; i < comm_count; i++) { + if(!comms[i].is_coll_leader) { + s = &comms[i]; + break; + } + } + + return s; +} + +int mca_coll_xhc_bcast(void *buf, int count, ompi_datatype_t *datatype, int root, + ompi_communicator_t *ompi_comm, mca_coll_base_module_t *ompi_module) { + + xhc_module_t *module = (xhc_module_t *) ompi_module; + + if(!module->init) { + int ret = xhc_lazy_init(module, ompi_comm); + if(ret != OMPI_SUCCESS) return ret; + } + + if(!ompi_datatype_is_predefined(datatype)) { + static bool warn_shown = false; + + if(!warn_shown) { + opal_output_verbose(MCA_BASE_VERBOSE_WARN, + ompi_coll_base_framework.framework_output, + "coll:xhc: Warning: XHC does not currently support " + "derived datatypes; utilizing fallback component"); + warn_shown = true; + } + + xhc_coll_fns_t fallback = ((xhc_module_t *) module)->prev_colls; + return fallback.coll_bcast(buf, count, datatype, root, + ompi_comm, fallback.coll_bcast_module); + } + + // ---- + + xhc_peer_info_t *peer_info = module->peer_info; + xhc_data_t *data = module->data; + + xhc_comm_t *comms = data->comms; + int comm_count = data->comm_count; + + size_t dtype_size, bytes_total; + ompi_datatype_type_size(datatype, &dtype_size); + bytes_total = count * dtype_size; + + int rank = ompi_comm_rank(ompi_comm); + + bool do_cico = (bytes_total <= OMPI_XHC_CICO_MAX); + void *local_cico = xhc_get_cico(peer_info, comms[0].manager_rank); + void *src_buffer; + + // Only really necessary for smsc/knem + xhc_copy_data_t *region_data = NULL; + + // ---- + + xf_sig_t pvt_seq = ++data->pvt_coll_seq; + + xhc_bcast_try_leader(comms, comm_count, peer_info, rank, root, pvt_seq); + + // No chunking for now... TODO? + if(rank == root && do_cico) { + memcpy(local_cico, buf, bytes_total); + } + + if(!do_cico) { + int err = xhc_copy_expose_region(buf, bytes_total, ®ion_data); + if(err != 0) { + return OMPI_ERROR; + } + } + + xhc_bcast_children_init(comms, comm_count, buf, + (rank == root ? 
bytes_total : 0), region_data, do_cico, rank, pvt_seq); + + if(rank == root) { + goto coll_finish; + } + + // ---- + + /* Not actually necessary for the broadcast operation, but + * good for consistency between all seq/ack numbers */ + for(int i = 0; i < comm_count; i++) { + comms[i].my_member_ctrl->member_seq = pvt_seq; + if(!comms[i].is_coll_leader) { + break; + } + } + + xhc_comm_t *src_comm = xhc_bcast_src_comm(comms, comm_count); + xhc_comm_ctrl_t *src_ctrl = src_comm->comm_ctrl; + + WAIT_FLAG(&src_ctrl->coll_seq, pvt_seq, 0); + xhc_atomic_rmb(); + + if(!do_cico) { + src_buffer = src_ctrl->data_vaddr; + } else { + src_buffer = xhc_get_cico(peer_info, src_ctrl->cico_id); + if(src_buffer == NULL) return OMPI_ERR_OUT_OF_RESOURCE; + } + + size_t bytes_done = 0; + size_t bytes_available = 0; + + while(bytes_done < bytes_total) { + size_t copy_size = opal_min(src_comm->chunk_size, bytes_total - bytes_done); + + void *data_dst = (char *) buf + bytes_done; + void *data_src = (char *) src_buffer + bytes_done; + void *data_cico_dst = (char *) local_cico + bytes_done; + + if(bytes_available < copy_size) { + do { + volatile xf_size_t *brp = &src_ctrl->bytes_ready; + bytes_available = xhc_atomic_load_size_t(brp) - bytes_done; + } while(bytes_available < copy_size); + + // Wait on loads inside the loop + xhc_atomic_rmb(); + } + + /* Pipelining is not necessary on the bottom + * level, copy all available at once */ + if(!comms[0].is_coll_leader) { + copy_size = bytes_available; + } + + if(!do_cico) { + int err = xhc_copy_from(&peer_info[src_ctrl->leader_rank], + data_dst, data_src, copy_size, src_ctrl->access_token); + if(err != 0) { + return OMPI_ERROR; + } + } else { + memcpy((comms[0].is_coll_leader + ? data_cico_dst : data_dst), data_src, copy_size); + } + + bytes_done += copy_size; + bytes_available -= copy_size; + + /* Do make sure the memcpy has completed before + * writing to the peers' bytes_ready. */ + xhc_atomic_wmb(); + + xhc_bcast_children_set_bytes_ready(comms, comm_count, bytes_done); + + if(do_cico && comms[0].is_coll_leader) { + memcpy(data_dst, data_cico_dst, copy_size); + } + } + + if(!do_cico) { + xhc_copy_close_region(region_data); + } + + coll_finish: + + /* No wmb() necessary before sending ACK, as all operations + * that should be waited on (reads from shared buffers) have + * explicit barriers following them. */ + + xhc_bcast_do_ack(comms, comm_count, pvt_seq); + + return OMPI_SUCCESS; +} diff --git a/ompi/mca/coll/xhc/coll_xhc_component.c b/ompi/mca/coll/xhc/coll_xhc_component.c new file mode 100644 index 00000000000..dac4fd3db2d --- /dev/null +++ b/ompi/mca/coll/xhc/coll_xhc_component.c @@ -0,0 +1,677 @@ +/* + * Copyright (c) 2021-2023 Computer Architecture and VLSI Systems (CARV) + * Laboratory, ICS Forth. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" +#include "mpi.h" + +#include "ompi/mca/coll/coll.h" +#include "ompi/mca/coll/base/base.h" + +#include "opal/mca/shmem/base/base.h" +#include "opal/util/show_help.h" + +#include "coll_xhc.h" + +typedef int (*csv_parse_conv_fn_t)(char *str, void *dst); +typedef void (*csv_parse_destruct_fn_t)(void *data); + +static int xhc_register(void); + +const char *mca_coll_xhc_component_version_string = + "Open MPI xhc collective MCA component version " OMPI_VERSION; + +static const char *hwloc_topo_str[] = { + "node", "flat", + "socket", + "numa", + "l3", "l3cache", + "l2", "l2cache", + "l1", "l1cache", + "core", + "hwthread", "thread" +}; + +static const xhc_loc_t hwloc_topo_val[] = { + OPAL_PROC_ON_NODE, OPAL_PROC_ON_NODE, + OPAL_PROC_ON_SOCKET, + OPAL_PROC_ON_NUMA, + OPAL_PROC_ON_L3CACHE, OPAL_PROC_ON_L3CACHE, + OPAL_PROC_ON_L2CACHE, OPAL_PROC_ON_L2CACHE, + OPAL_PROC_ON_L1CACHE, OPAL_PROC_ON_L1CACHE, + OPAL_PROC_ON_CORE, + OPAL_PROC_ON_HWTHREAD, OPAL_PROC_ON_HWTHREAD +}; + +mca_coll_xhc_component_t mca_coll_xhc_component = { + .super = { + .collm_version = { + MCA_COLL_BASE_VERSION_2_4_0, + + .mca_component_name = "xhc", + MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, + OMPI_MINOR_VERSION, OMPI_RELEASE_VERSION), + + .mca_register_component_params = xhc_register, + }, + + .collm_data = { + MCA_BASE_METADATA_PARAM_CHECKPOINT + }, + + .collm_init_query = mca_coll_xhc_component_init_query, + .collm_comm_query = mca_coll_xhc_module_comm_query, + }, + + .priority = 0, + .print_info = false, + + .shmem_backing = NULL, + + .dynamic_leader = false, + + .barrier_root = 0, + + .dynamic_reduce = OMPI_XHC_DYNAMIC_REDUCE_NON_FLOAT, + .lb_reduce_leader_assist = + (OMPI_XHC_LB_RLA_TOP_LEVEL | OMPI_XHC_LB_RLA_FIRST_CHUNK), + + .force_reduce = false, + + .cico_max = 1024, + + .uniform_chunks = true, + .uniform_chunks_min = 1024, + + /* These are the parameters that will need + * processing, and their default values. */ + .hierarchy_mca = "numa,socket", + .chunk_size_mca = "16K" +}; + +/* Initial query function that is invoked during MPI_INIT, allowing + * this component to disqualify itself if it doesn't support the + * required level of thread support. 
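+ * XHC places no additional requirements here and always reports success.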
*/ +int mca_coll_xhc_component_init_query(bool enable_progress_threads, + bool enable_mpi_threads) { + + return OMPI_SUCCESS; +} + +static mca_base_var_enum_value_t dynamic_reduce_options[] = { + {OMPI_XHC_DYNAMIC_REDUCE_DISABLED, "disabled"}, + {OMPI_XHC_DYNAMIC_REDUCE_NON_FLOAT, "non-float"}, + {OMPI_XHC_DYNAMIC_REDUCE_ALL, "all"}, + {0, NULL} +}; + +static mca_base_var_enum_value_flag_t lb_reduce_leader_assist_options[] = { + {OMPI_XHC_LB_RLA_TOP_LEVEL, "top", OMPI_XHC_LB_RLA_ALL}, + {OMPI_XHC_LB_RLA_FIRST_CHUNK, "first", OMPI_XHC_LB_RLA_ALL}, + {OMPI_XHC_LB_RLA_ALL, "all", + (OMPI_XHC_LB_RLA_TOP_LEVEL | OMPI_XHC_LB_RLA_FIRST_CHUNK)}, + {0, NULL, 0} +}; + +static int xhc_register(void) { + mca_base_var_enum_t *var_enum; + mca_base_var_enum_flag_t *var_enum_flag; + char *tmp, *desc; + int ret; + + /* Priority */ + + (void) mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, + "priority", "Priority of the xhc component", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_2, + MCA_BASE_VAR_SCOPE_READONLY, &mca_coll_xhc_component.priority); + + /* Info */ + + (void) mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, + "print_info", "Print information during initialization", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_3, + MCA_BASE_VAR_SCOPE_READONLY, &mca_coll_xhc_component.print_info); + + /* SHM Backing dir */ + + mca_coll_xhc_component.shmem_backing = (access("/dev/shm", W_OK) == 0 ? + "/dev/shm" : opal_process_info.job_session_dir); + + (void) mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, + "shmem_backing", "Directory to place backing files for shared-memory" + " control-data communication", MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, + OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_READONLY, + &mca_coll_xhc_component.shmem_backing); + + /* Dynamic leader */ + + (void) mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, + "dynamic_leader", "Enable dynamic operation-wise group-leader selection", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_READONLY, &mca_coll_xhc_component.dynamic_leader); + + /* Dynamic reduce */ + + ret = mca_base_var_enum_create("coll_xhc_dynamic_reduce_options", + dynamic_reduce_options, &var_enum); + if(ret != OPAL_SUCCESS) { + return ret; + } + + /* Barrier root */ + + (void) mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, + "barrier_root", "Internal root for the barrier operation (rank ID)", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_READONLY, &mca_coll_xhc_component.barrier_root); + + (void) mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, + "dynamic_reduce", "Dynamic/out-of-order intra-group reduction", + MCA_BASE_VAR_TYPE_INT, var_enum, 0, 0, OPAL_INFO_LVL_6, + MCA_BASE_VAR_SCOPE_READONLY, &mca_coll_xhc_component.dynamic_reduce); + + OBJ_RELEASE(var_enum); + + /* Load balancing: Reduce leader assistance */ + + ret = mca_base_var_enum_create_flag("coll_xhc_lb_reduce_leader_assist", + lb_reduce_leader_assist_options, &var_enum_flag); + if(ret != OPAL_SUCCESS) { + return ret; + } + + (void) mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, + "lb_reduce_leader_assist", "Reduction leader assistance modes for load balancing", + MCA_BASE_VAR_TYPE_INT, &var_enum_flag->super, 0, 0, OPAL_INFO_LVL_6, + MCA_BASE_VAR_SCOPE_READONLY, &mca_coll_xhc_component.lb_reduce_leader_assist); + + OBJ_RELEASE(var_enum_flag); + + /* Force enable 
"hacky" reduce */ + + (void) mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, + "force_reduce", "Force enable the \"special\" Reduce for all calls", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, &mca_coll_xhc_component.force_reduce); + + /* Hierarchy features */ + + desc = NULL; + + for(size_t i = 0; i < sizeof(hwloc_topo_str)/sizeof(char *); i++) { + ret = opal_asprintf(&tmp, "%s%s%s", (i > 0 ? desc : ""), + (i > 0 ? ", " : ""), hwloc_topo_str[i]); + free(desc); desc = tmp; + if(ret < 0) { + return OPAL_ERR_OUT_OF_RESOURCE; + } + } + + ret = opal_asprintf(&tmp, "Comma-separated list of topology features to " + "consider for the hierarchy (%s)", desc); + free(desc); desc = tmp; + if(ret < 0) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + (void) mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, + "hierarchy", desc, MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_4, + MCA_BASE_VAR_SCOPE_READONLY, &mca_coll_xhc_component.hierarchy_mca); + + free(desc); + + /* Chunk size(s) */ + + (void) mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, + "chunk_size", "The chunk size(s) to be used for the pipeline " + "(single value, or comma separated list for different hierarchy levels " + "(bottom to top))", + MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_READONLY, &mca_coll_xhc_component.chunk_size_mca); + + /* Allreduce uniform chunks */ + + (void) mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, + "uniform_chunks", "Automatically optimize chunk size in reduction " + "collectives according to message size, for load balancing", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_READONLY, &mca_coll_xhc_component.uniform_chunks); + + /* Allreduce uniform chunks min size */ + + (void) mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, + "uniform_chunks_min", "Minimum chunk size for reduction collectives, " + "when \"uniform chunks\" are enabled", MCA_BASE_VAR_TYPE_SIZE_T, + NULL, 0, 0, OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_READONLY, + &mca_coll_xhc_component.uniform_chunks_min); + + /* CICO threshold (inclusive) */ + + (void) mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version, + "cico_max", "Maximum message size up to which to use CICO", + MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0, OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_READONLY, &mca_coll_xhc_component.cico_max); + + return OMPI_SUCCESS; +} + +static int parse_csv(const char *csv_orig, char sep, char ignore_start, + char ignore_end, void **vals_dst, int *len_dst, size_t type_size, + csv_parse_conv_fn_t conv_fn, csv_parse_destruct_fn_t destructor_fn, + char *err_help_header) { + + if(csv_orig == NULL || strlen(csv_orig) == 0) { + *vals_dst = NULL; + *len_dst = 0; + return OMPI_SUCCESS; + } + + char *csv = NULL; + void *vals = NULL; + + int vals_size = 0; + int ntokens = 0; + + int return_code = OMPI_SUCCESS; + + if(!(csv = strdup(csv_orig))) { + RETURN_WITH_ERROR(return_code, OMPI_ERR_OUT_OF_RESOURCE, end); + } + + if(!(vals = malloc((vals_size = 5) * type_size))) { + RETURN_WITH_ERROR(return_code, OMPI_ERR_OUT_OF_RESOURCE, end); + } + + int ignore_cnt = 0; + char *token = csv; + + int csv_len = strlen(csv); + + for(int i = 0; i < csv_len + 1; i++) { + char *c = csv+i; + + if(ntokens == vals_size) { + void *tmp = realloc(vals, (vals_size *= 2) * sizeof(type_size)); + if(!tmp) { + RETURN_WITH_ERROR(return_code, 
OMPI_ERR_OUT_OF_RESOURCE, end); + } + vals = tmp; + } + + if(ignore_start != 0) { + if(*c == ignore_start) { + ignore_cnt++; + } else if(*c == ignore_end) { + ignore_cnt--; + } + + if(ignore_cnt < 0) { + RETURN_WITH_ERROR(return_code, OMPI_ERR_BAD_PARAM, end); + } + } + + if(ignore_cnt == 0 && (*c == sep || *c == '\0')) { + char oldc = *c; + *c = '\0'; + + int status = conv_fn(token, (char *) vals + ntokens*type_size); + + if(status != OMPI_SUCCESS) { + if(err_help_header) { + opal_show_help("help-coll-xhc.txt", + err_help_header, true, token, csv_orig); + } + + RETURN_WITH_ERROR(return_code, status, end); + } + + ntokens++; + + *c = oldc; + token = c + 1; + } + } + + *vals_dst = vals; + *len_dst = ntokens; + + end: + + free(csv); + + if(return_code != OMPI_SUCCESS) { + if(vals && destructor_fn) { + for(int i = 0; i < ntokens; i++) { + destructor_fn((char *) vals + i*type_size); + } + } + + free(vals); + } + + return return_code; +} + +static int conv_xhc_loc_def_rank_list(char *str, void *result) { + char *strs[2] = {str, NULL}; + int nums[2] = {-1, -1}; + + char *range_op_pos = NULL; + + int return_code = OMPI_SUCCESS; + + if((range_op_pos = strstr(str, ".."))) { + strs[1] = range_op_pos + 2; + *range_op_pos = '\0'; + } + + for(int i = 0; i < 2 && strs[i]; i++) { + char *endptr; + + nums[i] = strtol(strs[i], &endptr, 10); + + if(endptr[0] != '\0' || nums[i] < 0) { + RETURN_WITH_ERROR(return_code, OMPI_ERR_BAD_PARAM, end); + } + } + + ((xhc_rank_range_t *) result)->start_rank = nums[0]; + ((xhc_rank_range_t *) result)->end_rank = (nums[1] != -1 ? nums[1] : nums[0]); + + end: + + if(range_op_pos) { + *range_op_pos = '.'; + } + + return return_code; +} + +static void mca_coll_xhc_loc_def_construct(xhc_loc_def_t *def) { + def->named_loc = 0; + def->rank_list = NULL; + def->rank_list_len = 0; + def->split = 0; + def->max_ranks = 0; + def->repeat = false; +} + +static void mca_coll_xhc_loc_def_destruct(xhc_loc_def_t *def) { + free(def->rank_list); +} + +OBJ_CLASS_INSTANCE(xhc_loc_def_t, opal_list_item_t, + mca_coll_xhc_loc_def_construct, mca_coll_xhc_loc_def_destruct); + +static int conv_xhc_loc_def(char *str, void *result) { + int return_code = OMPI_SUCCESS; + + char *s = strdup(str); + xhc_loc_def_t *def = OBJ_NEW(xhc_loc_def_t); + + if(!s || !def) { + RETURN_WITH_ERROR(return_code, OMPI_ERR_OUT_OF_RESOURCE, end); + } + + /* Parse modifiers and remove them from string */ + + if(s[strlen(s) - 1] == '*') { + def->repeat = true; + s[strlen(s) - 1] = '\0'; + } + + char *colon_pos = strrchr(s, ':'); + char *qmark_pos = strrchr(s, '?'); + + if(colon_pos && qmark_pos) { + RETURN_WITH_ERROR(return_code, OMPI_ERR_BAD_PARAM, end); + } else if(colon_pos || qmark_pos) { + char *numstr = (colon_pos ? 
colon_pos : qmark_pos); + char *endptr; + + int num = strtol(numstr + 1, &endptr, 10); + + if(endptr[0] != '\0' || num <= 0) { + RETURN_WITH_ERROR(return_code, OMPI_ERR_BAD_PARAM, end); + } + + if(colon_pos) { + def->split = num; + } else { + def->max_ranks = num; + } + + *numstr = '\0'; + } + + /* Parse locality definition */ + + if(s[0] == '[') { + if(def->repeat) { // repeat only makes sense with named localities + RETURN_WITH_ERROR(return_code, OMPI_ERR_BAD_PARAM, end); + } + + s[strlen(s) - 1] = '\0'; + + int status = parse_csv(s+1, ',', 0, 0, (void **) &def->rank_list, + &def->rank_list_len, sizeof(xhc_rank_range_t), + conv_xhc_loc_def_rank_list, NULL, NULL); + + if(status != OMPI_SUCCESS) { + RETURN_WITH_ERROR(return_code, status, end); + } + } else { + bool found = false; + + for(size_t i = 0; i < sizeof(hwloc_topo_str)/sizeof(char *); i++) { + if(strcasecmp(s, hwloc_topo_str[i]) == 0) { + def->named_loc = hwloc_topo_val[i]; + found = true; + break; + } + } + + if(!found) { + RETURN_WITH_ERROR(return_code, OMPI_ERR_BAD_PARAM, end); + } + } + + * (xhc_loc_def_t **) result = def; + + end: + + free(s); + + if(return_code != OMPI_SUCCESS) { + OBJ_RELEASE_IF_NOT_NULL(def); + } + + return return_code; +} + +static void destruct_xhc_loc_def(void *data) { + OBJ_RELEASE(* (xhc_loc_def_t **) data); +} + +static int conv_xhc_loc_def_combination(char *str, void *result) { + xhc_loc_def_t **defs; + int ndefs; + + int status = parse_csv(str, '+', 0, 0, (void **) &defs, + &ndefs, sizeof(xhc_loc_def_t *), conv_xhc_loc_def, + destruct_xhc_loc_def, NULL); + if(status != OMPI_SUCCESS) { + return status; + } + + opal_list_t *def_list = (opal_list_t *) result; + OBJ_CONSTRUCT(def_list, opal_list_t); + + for(int i = 0; i < ndefs; i++) { + opal_list_append(def_list, (opal_list_item_t *) defs[i]); + } + + free(defs); + + return OMPI_SUCCESS; +} + +static void destruct_xhc_loc_def_combination(void *data) { + OPAL_LIST_DESTRUCT((opal_list_t *) data); +} + +int mca_coll_xhc_component_parse_hierarchy(const char *val_str, + opal_list_t **level_defs_dst, int *nlevel_defs_dst) { + + /* The hierarchy is in a comma-separated list format. Each item in the + * list specifies how to group ranks, and each different item entails + * a grouping step. + * + * Each item in this list is a '+'-separated list. Of course, this can + * be just one item, without any delimiter, specifying the locality to + * follow for the grouping (e.g. numa, socket, etc). + * + * But, it can also be more complex (multiple '+'-separated items), used + * to describe virtual hierarchies. This allows to group different ranks + * in different ways, e.g. some ranks according to numa, then others by + * something else, etc. + * + * Each item in this '+'-separated list, can be of the following types: + * 1. A "named locality", e.g. hwloc's localities (only ones currently + * available), see hwloc_topo_str[]. + * 2. A list of ranks that should be grouped together. This is a comma- + * separated list of integers, enclosed in [] (I know, list-ception!). + * It may also contain range operators (..), to select multiple ranks + * at once (e.g. 0..3 expands to 0,1,2,3). Example: [0..15,20,22]. + * The order of the ranks does not matter. + * + * Finally, each such item may be suffixed by a special modifier: + * 1. The split modifier (:) specifies to group according to the + * locality it refers to, but to split each such group into multiple + * parts. E.g. 
the locality 'numa:2' will group ranks into half-NUMA
+ * groups, such that for each NUMA node, half of its ranks end up in
+ * one group, and the rest in another.
+ * 2. The max-ranks modifier (?) works similarly to the split modifier,
+ * only that it specifies that at most _n_ ranks should be placed in
+ * each group. If more than _n_ ranks share the locality the modifier
+ * refers to, multiple groups will be created for these ranks, each one
+ * not more than _n_ ranks in size.
+ * 3. The repeat modifier (*), which can be specified along with the two
+ * previous modifiers, allows manual control over the repetition of
+ * named localities. See below, under 'repetition'.
+ *
+ * Repetition:
+ * Named localities are repeated for all distinct rank clusters. For
+ * example, "numa", even though it is a single key, means to group
+ * all ranks that are in the same NUMA node together, which will lead
+ * to multiple groups if multiple NUMA nodes are present. This is in
+ * contrast to rank lists, which only create a single group, containing
+ * the ranks specified in it. The different items in the '+'-separated
+ * list are consumed in order, left-to-right, and any named localities
+ * are automatically repeated to apply to all ranks that are not
+ * included in other items. When multiple named localities are present
+ * one after the other, the last one is repeated, unless another
+ * repetition was explicitly requested via the repeat modifier.
+ *
+ * Examples:
+ * "numa": Group according to numa locality
+ * "numa,socket": Group according to numa and then socket locality
+ * "node"/"flat": Group according to node locality -> all ranks in
+ * the same node -> flat hierarchy, i.e. none at all
+ *
+ * "numa:2,socket": Group according to numa locality, but with two
+ * groups per NUMA node, and then according to socket.
+ * "numa:2,numa,socket": Similar to the previous one, but this case
+ * will result in one of the two half-NUMA leaders further becoming
+ * the leader of the NUMA node.
+ * "numa?10,socket": Group according to numa, but with no more than 10
+ * ranks per NUMA node; create multiple groups if necessary. Then group
+ * according to socket.
+ *
+ * "[0..9]+[10..24]": Create 2 groups: one for the first 10 ranks,
+ * and another for the next 15 ones.
+ * "[0..39]+numa,socket": Group the first 40 ranks, and the rest
+ * according to numa locality. Then group according to socket.
+ *
+ * "socket+socket:2": Create at least two groups: one for all ranks
+ * in the first socket, while all other ranks are grouped according
+ * to socket locality, but with two groups per socket.
+ * "socket*+socket:2": Similar to the previous one, but only the last
+ * socket is split into two groups; all other ranks are grouped
+ * according to socket locality.
+ *
+ * If the top-most locality specified does not cover all ranks, one such
+ * locality will automatically be added (in the hierarchy sort method).
+ *
+ * (Oh god what have I done!
-Frankenstein, probably) */ + + int status = parse_csv(val_str, ',', '[', ']', (void **) level_defs_dst, + nlevel_defs_dst, sizeof(opal_list_t), conv_xhc_loc_def_combination, + destruct_xhc_loc_def_combination, "bad-hierarchy-item"); + + return status; +} + +static int conv_chunk_size(char *str, void *result) { + size_t last_idx = strlen(str) - 1; + char saved_char = str[last_idx]; + + size_t mult = 1; + + switch(str[last_idx]) { + case 'g': case 'G': + mult *= 1024; + case 'm': case 'M': + mult *= 1024; + case 'k': case 'K': + mult *= 1024; + + str[last_idx] = '\0'; + } + + bool legal = (str[0] != '\0'); + + for(char *c = str; *c; c++) { + if((*c < '0' || *c > '9') && *c != '-') { + legal = false; + break; + } + } + + if(legal) { + long long num = atoll(str) * mult; + * (size_t *) result = (size_t) (num > 0 ? num : -1); + } + + str[last_idx] = saved_char; + + return (legal ? OMPI_SUCCESS : OMPI_ERR_BAD_PARAM); +} + +int mca_coll_xhc_component_parse_chunk_sizes(const char *val_str, + size_t **chunks_dst, int *len_dst) { + + if(val_str == NULL) { + *chunks_dst = malloc(sizeof(size_t)); + if(*chunks_dst == NULL) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + (*chunks_dst)[0] = (size_t) -1; + *len_dst = 1; + + return OMPI_SUCCESS; + } + + int status = parse_csv(val_str, ',', 0, 0, (void **) chunks_dst, len_dst, + sizeof(size_t), conv_chunk_size, NULL, "bad-chunk-size-item"); + + return status; +} diff --git a/ompi/mca/coll/xhc/coll_xhc_module.c b/ompi/mca/coll/xhc/coll_xhc_module.c new file mode 100644 index 00000000000..879e521f662 --- /dev/null +++ b/ompi/mca/coll/xhc/coll_xhc_module.c @@ -0,0 +1,721 @@ +/* + * Copyright (c) 2021-2023 Computer Architecture and VLSI Systems (CARV) + * Laboratory, ICS Forth. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include +#include + +#include "mpi.h" + +#include "ompi/mca/coll/coll.h" +#include "ompi/mca/coll/base/base.h" +#include "opal/mca/smsc/smsc.h" + +#include "opal/util/arch.h" +#include "opal/util/show_help.h" +#include "opal/util/minmax.h" + +#include "coll_xhc.h" + +static int xhc_module_save_fallback_fns( + xhc_module_t *module, ompi_communicator_t *comm); + +static int xhc_module_create_hierarchy(mca_coll_xhc_module_t *module, + ompi_communicator_t *comm, opal_list_t *level_defs, int nlevel_defs, + xhc_loc_t **hierarchy_dst, int *hierarchy_len_dst); + +static int xhc_module_sort_hierarchy(mca_coll_xhc_module_t *module, + ompi_communicator_t *comm, xhc_loc_t **hierarchy_dst, int *hierarchy_len_dst); + +// ----------------------------- + +static void xhc_module_clear(xhc_module_t *module) { + memset(&module->prev_colls, 0, sizeof(module->prev_colls)); + + module->comm_size = 0; + module->rank = -1; + + module->hierarchy_string = NULL; + module->hierarchy = NULL; + module->hierarchy_len = 0; + + module->chunks = NULL; + module->chunks_len = 0; + + module->rbuf = NULL; + module->rbuf_size = 0; + + module->peer_info = NULL; + module->data = NULL; + module->init = false; +} + +static void mca_coll_xhc_module_construct(mca_coll_xhc_module_t *module) { + xhc_module_clear(module); +} + +static void mca_coll_xhc_module_destruct(mca_coll_xhc_module_t *module) { + xhc_fini(module); + + free(module->hierarchy_string); + free(module->hierarchy); + free(module->chunks); + free(module->rbuf); + free(module->peer_info); + + xhc_module_clear(module); +} + +OBJ_CLASS_INSTANCE(mca_coll_xhc_module_t, mca_coll_base_module_t, + mca_coll_xhc_module_construct, 
mca_coll_xhc_module_destruct); + +// ----------------------------- + +mca_coll_base_module_t *mca_coll_xhc_module_comm_query(ompi_communicator_t *comm, + int *priority) { + + if((*priority = mca_coll_xhc_component.priority) < 0) { + return NULL; + } + + if(OMPI_COMM_IS_INTER(comm) || ompi_comm_size(comm) == 1 + || ompi_group_have_remote_peers (comm->c_local_group)) { + + opal_output_verbose(MCA_BASE_VERBOSE_COMPONENT, + ompi_coll_base_framework.framework_output, + "coll:xhc:comm_query (%s/%s): intercomm, self-comm, " + "or not all ranks local; disqualifying myself", + ompi_comm_print_cid(comm), comm->c_name); + + return NULL; + } + + int comm_size = ompi_comm_size(comm); + for(int r = 0; r < comm_size; r++) { + ompi_proc_t *proc = ompi_comm_peer_lookup(comm, r); + + if(proc->super.proc_arch != opal_local_arch) { + opal_output_verbose(MCA_BASE_VERBOSE_COMPONENT, + ompi_coll_base_framework.framework_output, + "coll:xhc:comm_query (%s/%s): All ranks not of the same arch; " + "disabling myself", ompi_comm_print_cid(comm), comm->c_name); + + return NULL; + } + } + + mca_coll_base_module_t *module = + (mca_coll_base_module_t *) OBJ_NEW(mca_coll_xhc_module_t); + + if(module == NULL) { + return NULL; + } + + module->coll_module_enable = mca_coll_xhc_module_enable; + module->coll_module_disable = mca_coll_xhc_module_disable; + + module->coll_barrier = mca_coll_xhc_barrier; + + if(mca_smsc == NULL) { + opal_output_verbose(MCA_BASE_VERBOSE_COMPONENT, + ompi_coll_base_framework.framework_output, + "coll:xhc: Warning: No opal/smsc support found; " + "only barrier will be enabled"); + + return module; + } + + module->coll_bcast = mca_coll_xhc_bcast; + + if(!mca_smsc_base_has_feature(MCA_SMSC_FEATURE_CAN_MAP)) { + opal_output_verbose(MCA_BASE_VERBOSE_COMPONENT, + ompi_coll_base_framework.framework_output, + "coll:xhc: Warning: opal/smsc module is not CAN_MAP capable; " + "(all)reduce will be disabled, bcast might see reduced performance"); + + return module; + } + + module->coll_allreduce = mca_coll_xhc_allreduce; + module->coll_reduce = mca_coll_xhc_reduce; + + return module; +} + +#define COLL_FN_HELPER(_m, _api) .coll_ ## _api = (_m)->coll_ ## _api, \ + .coll_ ## _api ## _module = (_m) + +int mca_coll_xhc_module_enable(mca_coll_base_module_t *ompi_module, + ompi_communicator_t *comm) { + + xhc_module_t *module = (xhc_module_t *) ompi_module; + + int ret; + + // --- + + ret = xhc_module_save_fallback_fns(module, comm); + + /* This can/will happen often (see #9885), but theoretically + * isn't a problem, as in these cases the component wouldn't + * end up getting used anyway. 
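+ * ("These cases" being communicators on which no previously-selected
+ * component provides one of the collectives that XHC implements, so
+ * there is no fallback to defer to and XHC stays out of the way.)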
*/ + if(ret != OMPI_SUCCESS) { + opal_output_verbose(MCA_BASE_VERBOSE_COMPONENT, + ompi_coll_base_framework.framework_output, + "coll:xhc:module_enable (%s/%s): No previous fallback component " + "found; disabling myself", ompi_comm_print_cid(comm), comm->c_name); + + return ret; + } + + // --- + + module->comm_size = ompi_comm_size(comm); + module->rank = ompi_comm_rank(comm); + + module->peer_info = calloc(module->comm_size, sizeof(xhc_peer_info_t)); + + for(int r = 0; r < module->comm_size; r++) { + ompi_proc_t *peer_proc = ompi_comm_peer_lookup(comm, r); + + module->peer_info[r].proc = peer_proc; + module->peer_info[r].locality = peer_proc->super.proc_flags; + } + + module->peer_info[module->rank].locality |= + ((1 << OMPI_XHC_LOC_EXT_BITS) - 1) << OMPI_XHC_LOC_EXT_START; + + // --- + + /* This needs to happen here, and we need to save the hierarchy string, + * because the info value will have been gone by the time lazy_init is + * called. Furthermore, we can't prepeare the hierarchy here, as it might + * required communication (allgather) with the other ranks. */ + + const char *hier_mca = mca_coll_xhc_component.hierarchy_mca; + + opal_cstring_t *hier_info; + int hier_info_flag = 0; + + if(comm->super.s_info != NULL) { + opal_info_get(comm->super.s_info, "ompi_comm_coll_xhc_hierarchy", + &hier_info, &hier_info_flag); + + if(hier_info_flag) { + hier_mca = hier_info->string; + } + } + + module->hierarchy_string = strdup(hier_mca); + + if(hier_info_flag) { + OBJ_RELEASE(hier_info); + } + + if(!module->hierarchy_string) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + // --- + + ret = xhc_component_parse_chunk_sizes(mca_coll_xhc_component.chunk_size_mca, + &module->chunks, &module->chunks_len); + if(ret != OMPI_SUCCESS) { + return ret; + } + + // --- + + xhc_coll_fns_t xhc_fns = (xhc_coll_fns_t) { + COLL_FN_HELPER(ompi_module, allreduce), + COLL_FN_HELPER(ompi_module, barrier), + COLL_FN_HELPER(ompi_module, bcast), + COLL_FN_HELPER(ompi_module, reduce) + }; + + xhc_module_install_fns(module, comm, xhc_fns); + + return OMPI_SUCCESS; +} + +int mca_coll_xhc_module_disable(mca_coll_base_module_t *ompi_module, + ompi_communicator_t *comm) { + + xhc_module_t *module = (xhc_module_t *) ompi_module; + + xhc_module_install_fallback_fns(module, comm, NULL); + mca_coll_xhc_module_destruct(module); + + return OMPI_SUCCESS; +} + +// ----------------------------- + +#define SAVE_FALLBACK_COLL(_comm, _m, _dst, _api) do { \ + if((_m)->coll_ ## _api) { \ + MCA_COLL_SAVE_API(_comm, _api, (_dst).coll_ ## _api, \ + (_dst).coll_ ## _api ## _module, "xhc"); \ + \ + if(!(_dst).coll_ ## _api || !(_dst).coll_ ## _api ## _module) { \ + _save_status = OMPI_ERR_NOT_FOUND; \ + } \ + } \ +} while(0) + +#define INSTALL_FALLBACK_COLL(_comm, _m, _saved, _new, _api) do { \ + if((_comm)->c_coll->coll_ ## _api ## _module == (_m)) { \ + MCA_COLL_SAVE_API(_comm, _api, (_saved).coll_ ## _api, \ + (_saved).coll_ ## _api ## _module, "xhc"); \ + MCA_COLL_INSTALL_API(_comm, _api, (_new).coll_ ## _api, \ + (_new).coll_ ## _api ## _module, "xhc"); \ + } \ +} while(0) + +#define INSTALL_COLL(_comm, _src, _api) do { \ + if((_src).coll_ ## _api) { \ + MCA_COLL_INSTALL_API(_comm, _api, (_src).coll_ ## _api, \ + (_src).coll_ ## _api ## _module, "xhc"); \ + } \ +} while(0) + +/* Save the function pointers of the previous module, in XHC's + * struct. Only the functions that XHC will provide are saved. 
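+ * (Currently: allreduce, barrier, bcast and reduce.)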
*/ +static int xhc_module_save_fallback_fns( + xhc_module_t *module, ompi_communicator_t *comm) { + + mca_coll_base_module_t *ompi_module = (mca_coll_base_module_t *) module; + + xhc_coll_fns_t colls = {0}; + int _save_status = OMPI_SUCCESS; + + SAVE_FALLBACK_COLL(comm, ompi_module, colls, allreduce); + SAVE_FALLBACK_COLL(comm, ompi_module, colls, barrier); + SAVE_FALLBACK_COLL(comm, ompi_module, colls, bcast); + SAVE_FALLBACK_COLL(comm, ompi_module, colls, reduce); + + if(_save_status == OMPI_SUCCESS) { + module->prev_colls = colls; + } + + return _save_status; +} + +/* Replace XHC's pointers in c_coll with those from the fallback + * component saved earlier. XHC's pointers are conveniently returned + * in prev_fns_dst, to later pass to xhc_module_install_fns. */ +void mca_coll_xhc_module_install_fallback_fns(xhc_module_t *module, + ompi_communicator_t *comm, xhc_coll_fns_t *prev_fns_dst) { + + mca_coll_base_module_t *ompi_module = (mca_coll_base_module_t *) module; + + xhc_coll_fns_t saved = {0}; + + INSTALL_FALLBACK_COLL(comm, ompi_module, saved, module->prev_colls, allreduce); + INSTALL_FALLBACK_COLL(comm, ompi_module, saved, module->prev_colls, barrier); + INSTALL_FALLBACK_COLL(comm, ompi_module, saved, module->prev_colls, bcast); + INSTALL_FALLBACK_COLL(comm, ompi_module, saved, module->prev_colls, reduce); + + if(prev_fns_dst) { + *prev_fns_dst = saved; + } +} + +/* */ +void mca_coll_xhc_module_install_fns(xhc_module_t *module, + ompi_communicator_t *comm, xhc_coll_fns_t fns) { + + (void) module; + + INSTALL_COLL(comm, fns, allreduce); + INSTALL_COLL(comm, fns, barrier); + INSTALL_COLL(comm, fns, bcast); + INSTALL_COLL(comm, fns, reduce); +} + +// ----------------------------- + +int mca_coll_xhc_module_prepare_hierarchy( + xhc_module_t *module, ompi_communicator_t *comm) { + + int ret; + + opal_list_t *level_defs; + int nlevel_defs; + + ret = xhc_component_parse_hierarchy(module->hierarchy_string, + &level_defs, &nlevel_defs); + if(ret != OMPI_SUCCESS) { + return ret; + } + + ret = xhc_module_create_hierarchy(module, comm, level_defs, + nlevel_defs, &module->hierarchy, &module->hierarchy_len); + if(ret != OMPI_SUCCESS) { + return ret; + } + + for(int i = 0; i < nlevel_defs; i++) + OPAL_LIST_DESTRUCT(&level_defs[i]); + free(level_defs); + + ret = xhc_module_sort_hierarchy(module, comm, + &module->hierarchy, &module->hierarchy_len); + if(ret != OMPI_SUCCESS) { + return ret; + } + + return OMPI_SUCCESS; +} + +static int xhc_module_create_hierarchy(xhc_module_t *module, + ompi_communicator_t *comm, opal_list_t *level_defs, int nlevel_defs, + xhc_loc_t **hierarchy_dst, int *hierarchy_len_dst) { + + xhc_peer_info_t *peer_info = module->peer_info; + + int comm_size = ompi_comm_size(comm); + int rank = ompi_comm_rank(comm); + + xhc_loc_t *hierarchy = NULL; + int nvirt_hiers = 0; + + int *rank_list; + + opal_hwloc_locality_t *loc_list; + ompi_datatype_t *hwloc_locality_type = NULL; + + int ret, return_code = OMPI_SUCCESS; + + hierarchy = malloc(nlevel_defs * sizeof(xhc_loc_t)); + rank_list = malloc(comm_size * sizeof(int)); + loc_list = malloc(comm_size * sizeof(opal_hwloc_locality_t)); + + if(!hierarchy || !rank_list || !loc_list) { + RETURN_WITH_ERROR(return_code, OMPI_ERR_OUT_OF_RESOURCE, end); + } + + switch(sizeof(opal_hwloc_locality_t)) { + case 1: hwloc_locality_type = MPI_UINT8_T; break; + case 2: hwloc_locality_type = MPI_UINT16_T; break; + case 4: hwloc_locality_type = MPI_UINT32_T; break; + case 8: hwloc_locality_type = MPI_UINT64_T; break; + } + assert(hwloc_locality_type); + + 
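+    /* Rough sketch of what follows (hypothetical example): with a plain
+     * hierarchy such as "numa,socket", each pass of the loop below simply
+     * records the named hwloc locality for that level; grouping then
+     * relies on the locality flags already present in peer_info. Virtual
+     * definitions (rank lists, or the split/max-ranks modifiers) have no
+     * hwloc locality of their own, so each one claims one of the
+     * OMPI_XHC_LOC_EXT_BITS extended locality bits and marks its member
+     * ranks with it. */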
for(int h = 0; h < nlevel_defs; h++) { + opal_list_t *defs = &level_defs[h]; + + xhc_loc_def_t *my_def = NULL; + xhc_loc_t locality; + + xhc_loc_def_t *def_0 = (xhc_loc_def_t *) opal_list_get_first(defs); + + bool is_virtual = (opal_list_get_size(defs) > 1 || def_0->rank_list + || def_0->split > 1 || def_0->max_ranks > 0); + + if(is_virtual) { + if(nvirt_hiers == OMPI_XHC_LOC_EXT_BITS) { + opal_output_verbose(MCA_BASE_VERBOSE_COMPONENT, + ompi_coll_base_framework.framework_output, + "coll:xhc: Error: Too many virtual hierarchies"); + + RETURN_WITH_ERROR(return_code, OMPI_ERR_NOT_SUPPORTED, end); + } + + locality = 1 << (OMPI_XHC_LOC_EXT_START + nvirt_hiers); + nvirt_hiers++; + } else { + locality = def_0->named_loc; + } + + hierarchy[h] = locality; + def_0 = NULL; + + xhc_loc_def_t *def, *def_next; + + /* Handle rank lists; take note if I belong + * in one, and remove them from the mix */ + OPAL_LIST_FOREACH_SAFE(def, def_next, defs, xhc_loc_def_t) { + if(def->rank_list) { + if(!my_def) { + for(int rl = 0; rl < def->rank_list_len; rl++) { + if(rank >= def->rank_list[rl].start_rank + && rank <= def->rank_list[rl].end_rank) { + my_def = def; + break; + } + } + } + + opal_list_remove_item(defs, (opal_list_item_t *) def); + if(def != my_def) { + OBJ_RELEASE(def); + } + } + } + + bool dir_fwd = true; + + /* When multiple locality defitions are present, they are assigned + * to groups in a left-to-right fashion. At every turn, the first + * rank (determined by the minimum ID) that's still not part of + * a locality, as well as the other ranks that are local with it, + * claim/consume the next locality from the list. The direction + * serves to implement the repeat modifier. When it is located, + * the process starts taking place right-to-left following the max + * ID. At the end and after the loop, the repeated locality will + * be the only one left and all remaining ranks will follow it. */ + while(opal_list_get_size(defs) > 1) { + def = (xhc_loc_def_t *) (dir_fwd ? opal_list_get_first(defs) + : opal_list_get_last(defs)); + + if(dir_fwd && def->repeat) { + dir_fwd = false; + continue; + } + + int ticket = (my_def == NULL ? rank : (dir_fwd ? comm_size : -1)); + int chosen; + + ret = comm->c_coll->coll_allreduce(&ticket, &chosen, 1, + MPI_INT, (dir_fwd ? MPI_MIN : MPI_MAX), comm, + comm->c_coll->coll_allreduce_module); + if(ret != OMPI_SUCCESS) { + RETURN_WITH_ERROR(return_code, ret, end); + } + + if(chosen >= 0 && chosen < comm_size + && PEER_IS_LOCAL(peer_info, chosen, def->named_loc)) { + + my_def = def; + } + + opal_list_remove_item(defs, (opal_list_item_t *) def); + if(def != my_def) { + OBJ_RELEASE(def); + } + } + + if(opal_list_get_size(defs) > 0 && !my_def) { + my_def = (xhc_loc_def_t *) opal_list_get_first(defs); + opal_list_remove_item(defs, (opal_list_item_t *) my_def); + } + + /* Share which named locality each rank follows; ranks that + * follow different localities shouldn't be grouped together */ + opal_hwloc_locality_t follow_loc = (my_def ? 
my_def->named_loc : 0); + ret = comm->c_coll->coll_allgather(&follow_loc, 1, + hwloc_locality_type, loc_list, 1, hwloc_locality_type, + comm, comm->c_coll->coll_allgather_module); + if(ret != OMPI_SUCCESS) { + RETURN_WITH_ERROR(return_code, ret, end); + } + + if(my_def == NULL) { + continue; + } + + int member_id; + int members = 0; + + // If working with rank list, set the ranks from the list as "local" + if(my_def->rank_list) { + for(int i = 0; i < my_def->rank_list_len; i++) { + for(int r = my_def->rank_list[i].start_rank; + r <= my_def->rank_list[i].end_rank && r < comm_size; r++) { + if(r == rank) { + member_id = members; + } + + peer_info[r].locality |= locality; + rank_list[members++] = r; + } + } + } else if(is_virtual) { + /* We might have a named locality instead of a rank list, but if + * we still needed to create a virtual one, we need to apply it */ + for(int r = 0; r < comm_size; r++) { + if(loc_list[r] != my_def->named_loc) { + continue; + } + + if(!PEER_IS_LOCAL(peer_info, r, my_def->named_loc)) { + continue; + } + + if(r == rank) { + member_id = members; + } + + peer_info[r].locality |= locality; + rank_list[members++] = r; + } + } + + /* If split or max ranks was specified, math partition the locality + * and remove the previously added locality mapping to some ranks */ + if(my_def->split > 1) { + int piece_size = members / my_def->split; + int leftover = members % my_def->split; + + for(int m = 0, next_border = 0; m < members; m++) { + if(m == next_border) { + next_border += piece_size + (leftover > 0 ? 1 : 0); + if(leftover > 0) { + leftover--; + } + + if(member_id >= m && member_id < next_border) { + m = next_border - 1; + continue; + } + } + + peer_info[rank_list[m]].locality &= ~locality; + } + } else if(my_def->max_ranks > 1) { + for(int m = 0; m < members; m++) { + if(m % my_def->max_ranks == 0) { + if(member_id >= m && member_id - m < my_def->max_ranks) { + m += my_def->max_ranks - 1; + continue; + } + } + + peer_info[rank_list[m]].locality &= ~locality; + } + } + + OBJ_RELEASE_IF_NOT_NULL(my_def); + } + + *hierarchy_dst = hierarchy; + *hierarchy_len_dst = nlevel_defs; + +end: + + free(rank_list); + + if(return_code != OMPI_SUCCESS) { + free(hierarchy); + } + + return return_code; +} + +static int xhc_module_sort_hierarchy(xhc_module_t *module, + ompi_communicator_t *comm, xhc_loc_t **hierarchy_dst, + int *hierarchy_len_dst) { + + xhc_peer_info_t *peer_info = module->peer_info; + int comm_size = ompi_comm_size(comm); + + xhc_loc_t *old_hier = *hierarchy_dst; + int hier_len = *hierarchy_len_dst; + + xhc_loc_t *new_hier = NULL; + bool *hier_done = NULL; + + int return_code = OMPI_SUCCESS; + + new_hier = malloc((hier_len + 1) * sizeof(xhc_loc_t)); + hier_done = calloc(hier_len, sizeof(bool)); + + if(new_hier == NULL || hier_done == NULL) { + RETURN_WITH_ERROR(return_code, OMPI_ERR_OUT_OF_RESOURCE, end); + } + + bool has_virtual = false; + for(int i = 0; i < hier_len; i++) { + if(old_hier[i] >= (1 << OMPI_XHC_LOC_EXT_START)) { + has_virtual = true; + break; + } + } + + /* If any virtual hierarchy is involved, attempting to sort it is likely + * asking for trouble. Skip the sorting, and only consider adding a top + * common locality. There is a chance it wasn't actually necessary, but + * it never hurts. 
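+ * (Further below, the common locality is only appended when the
+ * existing top level is not already shared by all ranks.)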
*/ + + if(has_virtual) { + memcpy(new_hier, old_hier, hier_len * sizeof(xhc_loc_t)); + } else { + for(int new_idx = hier_len - 1; new_idx >= 0; new_idx--) { + int max_matches_count = -1; + int max_matches_hier_idx = -1; + + for(int i = 0; i < hier_len; i++) { + if(hier_done[i]) { + continue; + } + + int matches = 0; + + for(int r = 0; r < comm_size; r++) { + if(PEER_IS_LOCAL(peer_info, r, old_hier[i])) { + matches++; + } + } + + if(matches > max_matches_count) { + max_matches_count = matches; + max_matches_hier_idx = i; + } + } + + assert(max_matches_count != -1); + + new_hier[new_idx] = old_hier[max_matches_hier_idx]; + hier_done[max_matches_hier_idx] = true; + } + } + + xhc_loc_t common_locality = (xhc_loc_t) -1; + + for(int r = 0; r < comm_size; r++) { + ompi_proc_t *proc = ompi_comm_peer_lookup(comm, r); + common_locality &= proc->super.proc_flags; + } + + if(common_locality == 0) { + opal_output_verbose(MCA_BASE_VERBOSE_COMPONENT, + ompi_coll_base_framework.framework_output, + "coll:xhc: Error: There is no locality common " + "to all ranks in the communicator"); + + RETURN_WITH_ERROR(return_code, OMPI_ERR_NOT_SUPPORTED, end); + } + + if(hier_len == 0 || (common_locality & new_hier[hier_len - 1]) + != new_hier[hier_len - 1]) { + + new_hier[hier_len] = common_locality; + hier_len++; + } + + REALLOC(new_hier, hier_len, xhc_loc_t); + + free(old_hier); + + *hierarchy_dst = new_hier; + *hierarchy_len_dst = hier_len; + +end: + + free(hier_done); + + if(return_code != OMPI_SUCCESS) { + free(new_hier); + } + + return return_code; +} diff --git a/ompi/mca/coll/xhc/coll_xhc_reduce.c b/ompi/mca/coll/xhc/coll_xhc_reduce.c new file mode 100644 index 00000000000..5f28986fb66 --- /dev/null +++ b/ompi/mca/coll/xhc/coll_xhc_reduce.c @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2021-2023 Computer Architecture and VLSI Systems (CARV) + * Laboratory, ICS Forth. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" +#include "mpi.h" + +#include "ompi/constants.h" +#include "ompi/datatype/ompi_datatype.h" +#include "ompi/communicator/communicator.h" +#include "ompi/op/op.h" + +#include "opal/mca/rcache/base/base.h" +#include "opal/util/show_help.h" +#include "opal/util/minmax.h" + +#include "coll_xhc.h" + +int mca_coll_xhc_reduce(const void *sbuf, void *rbuf, + int count, ompi_datatype_t *datatype, ompi_op_t *op, int root, + ompi_communicator_t *ompi_comm, mca_coll_base_module_t *ompi_module) { + + xhc_module_t *module = (xhc_module_t *) ompi_module; + + // Currently, XHC's reduce only supports root = 0 + if(root == 0) { + return xhc_allreduce_internal(sbuf, rbuf, count, + datatype, op, ompi_comm, ompi_module, false); + } else { + xhc_coll_fns_t fallback = module->prev_colls; + + return fallback.coll_reduce(sbuf, rbuf, count, datatype, + op, root, ompi_comm, fallback.coll_reduce_module); + } +} diff --git a/ompi/mca/coll/xhc/help-coll-xhc.txt b/ompi/mca/coll/xhc/help-coll-xhc.txt new file mode 100644 index 00000000000..453a96df4fc --- /dev/null +++ b/ompi/mca/coll/xhc/help-coll-xhc.txt @@ -0,0 +1,24 @@ +# +# Copyright (c) 2021-2023 Computer Architecture and VLSI Systems (CARV) +# Laboratory, ICS Forth. All rights reserved. 
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+[bad-hierarchy-item]
+WARNING (coll/xhc)
+Unrecognized locality definition '%s' in hierarchy parameter string '%s'
+The component won't load
+#
+[bad-chunk-size-item]
+WARNING (coll/xhc)
+Malformed item '%s' in chunk size parameter string '%s'
+The component won't load
+#
+[xhc-init-failed]
+WARNING (coll/xhc)
+Component initialization failed with error code %d
+Errno: %d (%s)
diff --git a/ompi/mca/coll/xhc/resources/xhc-hierarchy.svg b/ompi/mca/coll/xhc/resources/xhc-hierarchy.svg
new file mode 100755
index 00000000000..c8f6d8a2da3
--- /dev/null
+++ b/ompi/mca/coll/xhc/resources/xhc-hierarchy.svg
@@ -0,0 +1,1176 @@
+[SVG markup not reproduced here. The figure shows an example 3-level XHC
+hierarchy: System, Socket and NUMA levels over cores P0..P15, with the
+NUMA 0/1/3 leaders promoted towards the upper levels.]

From 7b9e74c9d4c02a4b70126a44e1898cff3163e924 Mon Sep 17 00:00:00 2001
From: George Katevenis
Date: Wed, 15 Feb 2023 09:25:59 +0200
Subject: [PATCH 2/2] coll/HAN: Add support for XHC on the intra-comm

Signed-off-by: George Katevenis
---
 ompi/mca/coll/han/coll_han.h           |  6 ++++--
 ompi/mca/coll/han/coll_han_component.c | 17 ++++++++++-------
 ompi/mca/coll/han/coll_han_dynamic.h   |  3 +++
 ompi/mca/coll/han/coll_han_subcomms.c  |  6 ++++++
 4 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/ompi/mca/coll/han/coll_han.h b/ompi/mca/coll/han/coll_han.h
index de4018bec22..4e5323fc046 100644
--- a/ompi/mca/coll/han/coll_han.h
+++ b/ompi/mca/coll/han/coll_han.h
@@ -6,6 +6,8 @@
  * Copyright (c) 2020-2022 Bull S.A.S. All rights reserved.
  * Copyright (c) Amazon.com, Inc. or its affiliates.
  * All rights reserved.
+ * Copyright (c) 2023 Computer Architecture and VLSI Systems (CARV)
+ *                    Laboratory, ICS Forth. All rights reserved.
  * Copyright (c) 2024 NVIDIA Corporation. All rights reserved.
  * $COPYRIGHT$
  *
@@ -47,11 +49,11 @@
 /*
  * Today;
- * . only 2 modules available for intranode (low) level
+ * . 3 modules available for intranode (low) level
  * . only 2 modules available for internode (up) level
  */

-#define COLL_HAN_LOW_MODULES 2
+#define COLL_HAN_LOW_MODULES 3
 #define COLL_HAN_UP_MODULES 2

 struct mca_coll_han_bcast_args_s {
diff --git a/ompi/mca/coll/han/coll_han_component.c b/ompi/mca/coll/han/coll_han_component.c
index ed9582d5ffe..6ce8f5a06a8 100644
--- a/ompi/mca/coll/han/coll_han_component.c
+++ b/ompi/mca/coll/han/coll_han_component.c
@@ -4,6 +4,8 @@
  * reserved.
  * Copyright (c) 2022 IBM Corporation. All rights reserved
  * Copyright (c) 2020-2022 Bull S.A.S. All rights reserved.
+ * Copyright (c) 2023 Computer Architecture and VLSI Systems (CARV)
+ *                    Laboratory, ICS Forth. All rights reserved.
* $COPYRIGHT$ * * Additional copyrights may follow @@ -43,7 +45,8 @@ ompi_coll_han_components ompi_coll_han_available_components[COMPONENTS_COUNT] = { TUNED, "tuned" }, { SM, "sm" }, /* this should not be used, the collective component is gone */ { ADAPT, "adapt" }, - { HAN, "han" } + { HAN, "han" }, + { XHC, "xhc" } }; /* @@ -287,7 +290,7 @@ static int han_register(void) cs->han_bcast_low_module = 0; (void) mca_coll_han_query_module_from_mca(c, "bcast_low_module", - "low level module for bcast, currently only 0 for tuned", + "low level module for bcast, 0 tuned, 2 xhc", OPAL_INFO_LVL_9, &cs->han_bcast_low_module, &cs->han_op_module_name.bcast.han_op_low_module_name); @@ -307,7 +310,7 @@ static int han_register(void) cs->han_reduce_low_module = 0; (void) mca_coll_han_query_module_from_mca(c, "reduce_low_module", - "low level module for allreduce, currently only 0 tuned", + "low level module for allreduce, 0 tuned, 2 xhc", OPAL_INFO_LVL_9, &cs->han_reduce_low_module, &cs->han_op_module_name.reduce.han_op_low_module_name); @@ -326,7 +329,7 @@ static int han_register(void) cs->han_allreduce_low_module = 0; (void) mca_coll_han_query_module_from_mca(c, "allreduce_low_module", - "low level module for allreduce, currently only 0 tuned", + "low level module for allreduce, 0 tuned, 2 xhc", OPAL_INFO_LVL_9, &cs->han_allreduce_low_module, &cs->han_op_module_name.allreduce.han_op_low_module_name); @@ -338,7 +341,7 @@ static int han_register(void) cs->han_allgather_low_module = 0; (void) mca_coll_han_query_module_from_mca(c, "allgather_low_module", - "low level module for allgather, currently only 0 tuned", + "low level module for allgather, 0 tuned, 2 xhc", OPAL_INFO_LVL_9, &cs->han_allgather_low_module, &cs->han_op_module_name.allgather.han_op_low_module_name); @@ -350,7 +353,7 @@ static int han_register(void) cs->han_gather_low_module = 0; (void) mca_coll_han_query_module_from_mca(c, "gather_low_module", - "low level module for gather, currently only 0 tuned", + "low level module for gather, 0 tuned, 2 xhc", OPAL_INFO_LVL_9, &cs->han_gather_low_module, &cs->han_op_module_name.gather.han_op_low_module_name); @@ -374,7 +377,7 @@ static int han_register(void) cs->han_scatter_low_module = 0; (void) mca_coll_han_query_module_from_mca(c, "scatter_low_module", - "low level module for scatter, currently only 0 tuned", + "low level module for scatter, 0 tuned, 2 xhc", OPAL_INFO_LVL_9, &cs->han_scatter_low_module, &cs->han_op_module_name.scatter.han_op_low_module_name); diff --git a/ompi/mca/coll/han/coll_han_dynamic.h b/ompi/mca/coll/han/coll_han_dynamic.h index 403e458391e..82114227308 100644 --- a/ompi/mca/coll/han/coll_han_dynamic.h +++ b/ompi/mca/coll/han/coll_han_dynamic.h @@ -5,6 +5,8 @@ * reserved. * Copyright (c) 2020 Bull S.A.S. All rights reserved. * Copyright (c) 2022 IBM Corporation. All rights reserved + * Copyright (c) 2023 Computer Architecture and VLSI Systems (CARV) + * Laboratory, ICS Forth. All rights reserved. * * $COPYRIGHT$ * @@ -105,6 +107,7 @@ typedef enum COMPONENTS { SM, ADAPT, HAN, + XHC, COMPONENTS_COUNT } COMPONENT_T; diff --git a/ompi/mca/coll/han/coll_han_subcomms.c b/ompi/mca/coll/han/coll_han_subcomms.c index d1330188f41..92bddb3ba51 100644 --- a/ompi/mca/coll/han/coll_han_subcomms.c +++ b/ompi/mca/coll/han/coll_han_subcomms.c @@ -3,6 +3,8 @@ * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2020 Bull S.A.S. All rights reserved. + * Copyright (c) 2023 Computer Architecture and VLSI Systems (CARV) + * Laboratory, ICS Forth. All rights reserved. 
* * Copyright (c) 2024 NVIDIA Corporation. All rights reserved. * $COPYRIGHT$ @@ -314,6 +316,10 @@ int mca_coll_han_comm_create(struct ompi_communicator_t *comm, &comm_info, &(low_comms[1])); assert(OMPI_COMM_IS_DISJOINT_SET(low_comms[1]) && !OMPI_COMM_IS_DISJOINT(low_comms[1])); + opal_info_set(&comm_info, "ompi_comm_coll_preference", "xhc,^han"); + ompi_comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, + &comm_info, &(low_comms[2])); + /* * Upgrade libnbc module priority to set up up_comms[0] with libnbc module * This sub-communicator contains one process per node: processes with the