Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ompi/coll/mccl #7409

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 75 additions & 0 deletions config/ompi_check_mccl.m4
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
dnl -*- shell-script -*-
dnl
dnl Copyright (c) 2020 Mellanox Technologies. All rights reserved.
dnl Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
dnl Copyright (c) 2015 Research Organization for Information Science
dnl and Technology (RIST). All rights reserved.
dnl $COPYRIGHT$
dnl
dnl Additional copyrights may follow
dnl
dnl $HEADER$
dnl

# OMPI_CHECK_MCCL(prefix, [action-if-found], [action-if-not-found])
# --------------------------------------------------------
# Check whether mccl support can be found.  Sets prefix_{CPPFLAGS,
# LDFLAGS, LIBS} as needed and runs action-if-found if there is
# support, otherwise executes action-if-not-found.
#
# The search is skipped entirely for --without-mccl.  If --with-mccl
# was given (with or without a directory) and mccl ends up unusable,
# configure aborts instead of running action-if-not-found.
AC_DEFUN([OMPI_CHECK_MCCL],[
    OPAL_VAR_SCOPE_PUSH([ompi_check_mccl_dir ompi_check_mccl_libs ompi_check_mccl_happy CPPFLAGS_save LDFLAGS_save LIBS_save])

    AC_ARG_WITH([mccl],
        [AS_HELP_STRING([--with-mccl(=DIR)],
             [Build mccl (Mellanox Collective Communication library) support, optionally adding
              DIR/include and DIR/lib or DIR/lib64 to the search path for headers and libraries])])

    AS_IF([test "$with_mccl" != "no"],
          [ompi_check_mccl_libs=mccl
           # Only treat the option value as a directory when one was given
           AS_IF([test ! -z "$with_mccl" && test "$with_mccl" != "yes"],
                 [ompi_check_mccl_dir=$with_mccl])

           CPPFLAGS_save=$CPPFLAGS
           LDFLAGS_save=$LDFLAGS
           LIBS_save=$LIBS

           OPAL_LOG_MSG([$1_CPPFLAGS : $$1_CPPFLAGS], 1)
           OPAL_LOG_MSG([$1_LDFLAGS : $$1_LDFLAGS], 1)
           OPAL_LOG_MSG([$1_LIBS : $$1_LIBS], 1)

           OPAL_CHECK_PACKAGE([$1],
                              [api/mccl.h],
                              [$ompi_check_mccl_libs],
                              [mccl_init_context],
                              [],
                              [$ompi_check_mccl_dir],
                              [],
                              [ompi_check_mccl_happy="yes"],
                              [ompi_check_mccl_happy="no"])

           # Probe for an optional symbol using the flags that were just
           # computed for the caller-supplied prefix, not a fixed name.
           AS_IF([test "$ompi_check_mccl_happy" = "yes"],
                 [
                     CPPFLAGS=$$1_CPPFLAGS
                     LDFLAGS=$$1_LDFLAGS
                     LIBS=$$1_LIBS
                     AC_CHECK_FUNCS([mccl_comm_free], [], [])
                 ],
                 [])

           # Do not leak the probe flags into the rest of configure
           CPPFLAGS=$CPPFLAGS_save
           LDFLAGS=$LDFLAGS_save
           LIBS=$LIBS_save],
          [ompi_check_mccl_happy=no])

    AS_IF([test "$ompi_check_mccl_happy" = "yes" && test "$enable_progress_threads" = "yes"],
          [AC_MSG_WARN([mccl driver does not currently support progress threads.  Disabling MCCL.])
           ompi_check_mccl_happy="no"])

    AS_IF([test "$ompi_check_mccl_happy" = "yes"],
          [$2],
          [AS_IF([test ! -z "$with_mccl" && test "$with_mccl" != "no"],
                 [AC_MSG_ERROR([MCCL support requested but not found.  Aborting])])
           $3])

    OPAL_VAR_SCOPE_POP
])
4 changes: 3 additions & 1 deletion ompi/mca/coll/base/coll_tags.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,9 @@
#define MCA_COLL_BASE_TAG_SCAN -24
#define MCA_COLL_BASE_TAG_SCATTER -25
#define MCA_COLL_BASE_TAG_SCATTERV -26
/* Fixed tag value added for the mccl component. */
#define MCA_COLL_BASE_TAG_MCCL -27

/*
 * The non-blocking range starts one below the last fixed tag.  It moved
 * from -27 to -28 to make room for MCA_COLL_BASE_TAG_MCCL; there must be
 * exactly one definition of it, and it must not collide with a fixed tag.
 */
#define MCA_COLL_BASE_TAG_NONBLOCKING_BASE -28
#define MCA_COLL_BASE_TAG_NONBLOCKING_END ((-1 * INT_MAX/2) + 1)
#define MCA_COLL_BASE_TAG_NEIGHBOR_BASE (MCA_COLL_BASE_TAG_NONBLOCKING_END - 1)
#define MCA_COLL_BASE_TAG_NEIGHBOR_END (MCA_COLL_BASE_TAG_NEIGHBOR_BASE - 1024)
Expand Down
46 changes: 46 additions & 0 deletions ompi/mca/coll/mccl/Makefile.am
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# -*- shell-script -*-
#
#
# Copyright (c) 2020 Mellanox Technologies. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
#

# Preprocessor flags for the mccl headers, as substituted by configure.
AM_CPPFLAGS = $(coll_mccl_CPPFLAGS)

# Headers first, then implementation files.
coll_mccl_sources = \
        coll_mccl.h \
        coll_mccl_debug.h \
        coll_mccl_dtypes.h \
        coll_mccl_component.c \
        coll_mccl_module.c \
        coll_mccl_ops.c

# The component is built in this directory as exactly one of:
#   mca_<type>_<name>.la     - installed DSO
#   libmca_<type>_<name>.la  - non-installed convenience library (static)
# Whichever variable is left empty below disables the other flavor.

if MCA_BUILD_ompi_coll_mccl_DSO
component_install = mca_coll_mccl.la
component_noinst =
else
component_install =
component_noinst = libmca_coll_mccl.la
endif

# DSO flavor: installed into the MCA component directory and linked
# against libmpi plus the mccl libraries.
mcacomponentdir = $(ompilibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_coll_mccl_la_SOURCES = $(coll_mccl_sources)
mca_coll_mccl_la_LDFLAGS = -module -avoid-version $(coll_mccl_LDFLAGS)
mca_coll_mccl_la_LIBADD = \
        $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la \
        $(coll_mccl_LIBS)

# Static flavor: same sources, only the mccl libraries are added.
noinst_LTLIBRARIES = $(component_noinst)
libmca_coll_mccl_la_SOURCES = $(coll_mccl_sources)
libmca_coll_mccl_la_LDFLAGS = -module -avoid-version $(coll_mccl_LDFLAGS)
libmca_coll_mccl_la_LIBADD = $(coll_mccl_LIBS)

103 changes: 103 additions & 0 deletions ompi/mca/coll/mccl/coll_mccl.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
/**
Copyright (c) 2020 Mellanox Technologies. All rights reserved.
$COPYRIGHT$

Additional copyrights may follow

$HEADER$
*/

#ifndef MCA_COLL_MCCL_H
#define MCA_COLL_MCCL_H

#include "ompi_config.h"

#include "mpi.h"
#include "ompi/mca/mca.h"
#include "opal/memoryhooks/memory.h"
#include "opal/mca/memory/base/base.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/request/request.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/communicator/communicator.h"
#include "ompi/attribute/attribute.h"
#include "ompi/op/op.h"

#include "orte/runtime/orte_globals.h"

#include "api/mccl.h"

#include "coll_mccl_debug.h"
#ifndef MCCL_VERSION
/*
 * Fallback used only when no earlier header defined MCCL_VERSION: packs a
 * major/minor pair into one integer using MCCL_MAJOR_BIT / MCCL_MINOR_BIT
 * as the shift amounts.
 */
#define MCCL_VERSION(maj, min) \
    (((maj) << MCCL_MAJOR_BIT) | ((min) << MCCL_MINOR_BIT))
#endif
BEGIN_C_DECLS

/**
 * Component-wide state of the mccl collective component.
 */
struct mca_coll_mccl_component_t {
    /** Base coll component; kept as the first member */
    mca_coll_base_component_2_0_0_t super;

    /** MCA parameter: priority of this component */
    int mccl_priority;

    /** MCA parameter: verbosity level of this component */
    int mccl_verbose;

    /** MCA parameter: enable MCCL */
    int mccl_enable;

    /** Read-only MCA parameter: libmccl version seen at compile time */
    char *compiletime_version;

    /** Read-only MCA parameter: libmccl version seen at run time */
    const char *runtime_version;

    /** MCA parameter: minimal number of processes a communicator must
        have for the corresponding mccl context to be created */
    int mccl_np;

    /** Whether or not mccl_init was ever called */
    bool libmccl_initialized;

    /** mccl context handle. NOTE(review): presumably shared by every
        communicator of this process -- confirm in the component code */
    mccl_context_h mccl_context;

    /** Free list. NOTE(review): presumably of request objects, going
        by the name -- confirm in the component code */
    opal_free_list_t requests;
};
typedef struct mca_coll_mccl_component_t mca_coll_mccl_component_t;

/** The single component instance, exported from the module. */
OMPI_MODULE_DECLSPEC extern mca_coll_mccl_component_t mca_coll_mccl_component;

/**
 * Per-communicator state for an MCCL enabled communicator.
 *
 * NOTE(review): each previous_<coll> / previous_<coll>_module pair looks
 * like the function and module that were installed before this component,
 * kept as a fallback -- confirm against the module implementation.
 */
struct mca_coll_mccl_module_t {
    /** Base coll module; kept as the first member */
    mca_coll_base_module_t super;

    /** Communicator this module belongs to */
    ompi_communicator_t *comm;

    /** NOTE(review): presumably the rank of this process in comm -- confirm */
    int rank;

    /** mccl communicator handle */
    mccl_comm_h mccl_comm;

    /* allreduce function/module pair */
    mca_coll_base_module_allreduce_fn_t previous_allreduce;
    mca_coll_base_module_t *previous_allreduce_module;

    /* barrier function/module pair */
    mca_coll_base_module_barrier_fn_t previous_barrier;
    mca_coll_base_module_t *previous_barrier_module;

    /* bcast function/module pair */
    mca_coll_base_module_bcast_fn_t previous_bcast;
    mca_coll_base_module_t *previous_bcast_module;
};
typedef struct mca_coll_mccl_module_t mca_coll_mccl_module_t;
OBJ_CLASS_DECLARATION(mca_coll_mccl_module_t);



/*
 * Component entry points.  NOTE(review): presumably wired into the coll
 * framework as the init-query / comm-query hooks -- confirm in
 * coll_mccl_component.c.
 */
int mca_coll_mccl_init_query(bool enable_progress_threads,
                             bool enable_mpi_threads);

mca_coll_base_module_t *mca_coll_mccl_comm_query(struct ompi_communicator_t *comm,
                                                 int *priority);

/*
 * Collective operations provided by this component: allreduce, barrier
 * and bcast.  Each takes the communicator and the coll module it is
 * invoked through as its last two arguments.
 */
int mca_coll_mccl_allreduce(const void *sbuf,
                            void *rbuf,
                            int count,
                            struct ompi_datatype_t *dtype,
                            struct ompi_op_t *op,
                            struct ompi_communicator_t *comm,
                            mca_coll_base_module_t *module);

int mca_coll_mccl_barrier(struct ompi_communicator_t *comm,
                          mca_coll_base_module_t *module);

int mca_coll_mccl_bcast(void *buf,
                        int count,
                        struct ompi_datatype_t *dtype,
                        int root,
                        struct ompi_communicator_t *comm,
                        mca_coll_base_module_t *module);

END_C_DECLS
#endif
Loading