Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New collectives component: XPMEM-based Hierarchical Collectives (XHC) #11418

Merged
merged 2 commits into from
May 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
6 changes: 4 additions & 2 deletions ompi/mca/coll/han/coll_han.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
* Copyright (c) 2020-2022 Bull S.A.S. All rights reserved.
* Copyright (c) Amazon.com, Inc. or its affiliates.
* All rights reserved.
* Copyright (c) 2023 Computer Architecture and VLSI Systems (CARV)
* Laboratory, ICS Forth. All rights reserved.
* Copyright (c) 2024 NVIDIA Corporation. All rights reserved.
* $COPYRIGHT$
*
Expand Down Expand Up @@ -47,11 +49,11 @@

/*
* Today;
* . only 2 modules available for intranode (low) level
* . 3 modules available for intranode (low) level
* . only 2 modules available for internode (up) level
*/

#define COLL_HAN_LOW_MODULES 2
#define COLL_HAN_LOW_MODULES 3
#define COLL_HAN_UP_MODULES 2

struct mca_coll_han_bcast_args_s {
Expand Down
17 changes: 10 additions & 7 deletions ompi/mca/coll/han/coll_han_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
* reserved.
* Copyright (c) 2022 IBM Corporation. All rights reserved
* Copyright (c) 2020-2022 Bull S.A.S. All rights reserved.
* Copyright (c) 2023 Computer Architecture and VLSI Systems (CARV)
* Laboratory, ICS Forth. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -43,7 +45,8 @@ ompi_coll_han_components ompi_coll_han_available_components[COMPONENTS_COUNT] =
{ TUNED, "tuned" },
{ SM, "sm" }, /* this should not be used, the collective component is gone */
{ ADAPT, "adapt" },
{ HAN, "han" }
{ HAN, "han" },
{ XHC, "xhc" }
};

/*
Expand Down Expand Up @@ -287,7 +290,7 @@ static int han_register(void)

cs->han_bcast_low_module = 0;
(void) mca_coll_han_query_module_from_mca(c, "bcast_low_module",
"low level module for bcast, currently only 0 for tuned",
"low level module for bcast, 0 tuned, 2 xhc",
OPAL_INFO_LVL_9,
&cs->han_bcast_low_module,
&cs->han_op_module_name.bcast.han_op_low_module_name);
Expand All @@ -307,7 +310,7 @@ static int han_register(void)

cs->han_reduce_low_module = 0;
(void) mca_coll_han_query_module_from_mca(c, "reduce_low_module",
"low level module for allreduce, currently only 0 tuned",
"low level module for allreduce, 0 tuned, 2 xhc",
OPAL_INFO_LVL_9, &cs->han_reduce_low_module,
&cs->han_op_module_name.reduce.han_op_low_module_name);

Expand All @@ -326,7 +329,7 @@ static int han_register(void)

cs->han_allreduce_low_module = 0;
(void) mca_coll_han_query_module_from_mca(c, "allreduce_low_module",
"low level module for allreduce, currently only 0 tuned",
"low level module for allreduce, 0 tuned, 2 xhc",
OPAL_INFO_LVL_9, &cs->han_allreduce_low_module,
&cs->han_op_module_name.allreduce.han_op_low_module_name);

Expand All @@ -338,7 +341,7 @@ static int han_register(void)

cs->han_allgather_low_module = 0;
(void) mca_coll_han_query_module_from_mca(c, "allgather_low_module",
"low level module for allgather, currently only 0 tuned",
"low level module for allgather, 0 tuned, 2 xhc",
OPAL_INFO_LVL_9, &cs->han_allgather_low_module,
&cs->han_op_module_name.allgather.han_op_low_module_name);

Expand All @@ -350,7 +353,7 @@ static int han_register(void)

cs->han_gather_low_module = 0;
(void) mca_coll_han_query_module_from_mca(c, "gather_low_module",
"low level module for gather, currently only 0 tuned",
"low level module for gather, 0 tuned, 2 xhc",
OPAL_INFO_LVL_9, &cs->han_gather_low_module,
&cs->han_op_module_name.gather.han_op_low_module_name);

Expand All @@ -374,7 +377,7 @@ static int han_register(void)

cs->han_scatter_low_module = 0;
(void) mca_coll_han_query_module_from_mca(c, "scatter_low_module",
"low level module for scatter, currently only 0 tuned",
"low level module for scatter, 0 tuned, 2 xhc",
OPAL_INFO_LVL_9, &cs->han_scatter_low_module,
&cs->han_op_module_name.scatter.han_op_low_module_name);

Expand Down
3 changes: 3 additions & 0 deletions ompi/mca/coll/han/coll_han_dynamic.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
* reserved.
* Copyright (c) 2020 Bull S.A.S. All rights reserved.
* Copyright (c) 2022 IBM Corporation. All rights reserved
* Copyright (c) 2023 Computer Architecture and VLSI Systems (CARV)
* Laboratory, ICS Forth. All rights reserved.
*
* $COPYRIGHT$
*
Expand Down Expand Up @@ -105,6 +107,7 @@ typedef enum COMPONENTS {
SM,
ADAPT,
HAN,
XHC,
COMPONENTS_COUNT
} COMPONENT_T;

Expand Down
6 changes: 6 additions & 0 deletions ompi/mca/coll/han/coll_han_subcomms.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2020 Bull S.A.S. All rights reserved.
* Copyright (c) 2023 Computer Architecture and VLSI Systems (CARV)
* Laboratory, ICS Forth. All rights reserved.
*
* Copyright (c) 2024 NVIDIA Corporation. All rights reserved.
* $COPYRIGHT$
Expand Down Expand Up @@ -314,6 +316,10 @@ int mca_coll_han_comm_create(struct ompi_communicator_t *comm,
&comm_info, &(low_comms[1]));
assert(OMPI_COMM_IS_DISJOINT_SET(low_comms[1]) && !OMPI_COMM_IS_DISJOINT(low_comms[1]));

opal_info_set(&comm_info, "ompi_comm_coll_preference", "xhc,^han");
ompi_comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0,
&comm_info, &(low_comms[2]));

/*
* Upgrade libnbc module priority to set up up_comms[0] with libnbc module
* This sub-communicator contains one process per node: processes with the
Expand Down
44 changes: 44 additions & 0 deletions ompi/mca/coll/xhc/Makefile.am
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#
# Copyright (c) 2021-2023 Computer Architecture and VLSI Systems (CARV)
# Laboratory, ICS Forth. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#

# Help-message text file for the xhc component; the dist_ prefix ships it in
# the distribution tarball and the opaldata_DATA primary installs it as data.
dist_opaldata_DATA = help-coll-xhc.txt

# Headers and C sources that make up the xhc component. The same list is
# used for both the DSO and the static variant of the component library.
sources = \
coll_xhc.h \
coll_xhc_atomic.h \
coll_xhc.c \
coll_xhc_component.c \
coll_xhc_module.c \
coll_xhc_bcast.c \
coll_xhc_barrier.c \
coll_xhc_reduce.c \
coll_xhc_allreduce.c

# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).

# Both lists start empty; exactly one of them receives the component
# library, selected by the MCA_BUILD_ompi_coll_xhc_DSO Automake conditional.
component_noinst =
component_install =
if MCA_BUILD_ompi_coll_xhc_DSO
component_install += mca_coll_xhc.la
else
component_noinst += libmca_coll_xhc.la
endif

# DSO variant: installed into $(ompilibdir) as a libtool module
# (-module -avoid-version) and linked against the OMPI library whose name
# is substituted for @OMPI_LIBMPI_NAME@.
mcacomponentdir = $(ompilibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_coll_xhc_la_SOURCES = $(sources)
mca_coll_xhc_la_LDFLAGS = -module -avoid-version
mca_coll_xhc_la_LIBADD = $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la

# Static variant: built as a non-installed (noinst) libtool library from
# the same sources. NOTE(review): presumably folded into the main library
# by the top-level build -- confirm against the MCA build system.
noinst_LTLIBRARIES = $(component_noinst)
libmca_coll_xhc_la_SOURCES = $(sources)
libmca_coll_xhc_la_LDFLAGS = -module -avoid-version