Skip to content

Commit

Permalink
ompi/coll/mccl
Browse files Browse the repository at this point in the history
    Mellanox collective communication library (MCCL) integration layer

Signed-off-by: Valentin Petrov <valentinp@mellanox.com>
  • Loading branch information
Valentin Petrov committed Feb 17, 2020
1 parent 1275766 commit e2222fd
Show file tree
Hide file tree
Showing 10 changed files with 924 additions and 1 deletion.
75 changes: 75 additions & 0 deletions config/ompi_check_mccl.m4
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
dnl -*- shell-script -*-
dnl
dnl Copyright (c) 2020 Mellanox Technologies. All rights reserved.
dnl Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
dnl Copyright (c) 2015 Research Organization for Information Science
dnl and Technology (RIST). All rights reserved.
dnl $COPYRIGHT$
dnl
dnl Additional copyrights may follow
dnl
dnl $HEADER$
dnl

# OMPI_CHECK_MCCL(prefix, [action-if-found], [action-if-not-found])
# --------------------------------------------------------
# Check whether MCCL (Mellanox Collective Communication Library) support
# can be found.  Sets prefix_{CPPFLAGS, LDFLAGS, LIBS} as needed and runs
# action-if-found if there is support; otherwise runs action-if-not-found.
#
# Honors --with-mccl(=DIR):
#   - the whole check is skipped for --with-mccl=no;
#   - MCCL is disabled (with a warning) when progress threads are enabled;
#   - configure aborts if MCCL was explicitly requested but is not usable.
AC_DEFUN([OMPI_CHECK_MCCL],[
    OPAL_VAR_SCOPE_PUSH([ompi_check_mccl_dir ompi_check_mccl_libs ompi_check_mccl_happy CPPFLAGS_save LDFLAGS_save LIBS_save])

    AC_ARG_WITH([mccl],
        [AS_HELP_STRING([--with-mccl(=DIR)],
            [Build mccl (Mellanox Collective Communication library) support, optionally adding
             DIR/include and DIR/lib or DIR/lib64 to the search path for headers and libraries])])

    AS_IF([test "$with_mccl" != "no"],
          [ompi_check_mccl_libs=mccl
           # Any value other than yes or the empty string is used as the search directory.
           AS_IF([test ! -z "$with_mccl" && test "$with_mccl" != "yes"],
                 [ompi_check_mccl_dir=$with_mccl])

           CPPFLAGS_save=$CPPFLAGS
           LDFLAGS_save=$LDFLAGS
           LIBS_save=$LIBS

           OPAL_LOG_MSG([$1_CPPFLAGS : $$1_CPPFLAGS], 1)
           OPAL_LOG_MSG([$1_LDFLAGS : $$1_LDFLAGS], 1)
           OPAL_LOG_MSG([$1_LIBS : $$1_LIBS], 1)

           OPAL_CHECK_PACKAGE([$1],
                              [api/mccl.h],
                              [$ompi_check_mccl_libs],
                              [mccl_init_context],
                              [],
                              [$ompi_check_mccl_dir],
                              [],
                              [ompi_check_mccl_happy="yes"],
                              [ompi_check_mccl_happy="no"])

           # Probe for an optional symbol using the flags that were just
           # computed for the caller-supplied prefix. The flags must follow
           # the prefix argument rather than a fixed component name so the
           # probe keeps working for any caller.
           AS_IF([test "$ompi_check_mccl_happy" = "yes"],
                 [CPPFLAGS=$$1_CPPFLAGS
                  LDFLAGS=$$1_LDFLAGS
                  LIBS=$$1_LIBS
                  AC_CHECK_FUNCS([mccl_comm_free], [], [])],
                 [])

           # Always restore the global flags saved above.
           CPPFLAGS=$CPPFLAGS_save
           LDFLAGS=$LDFLAGS_save
           LIBS=$LIBS_save],
          [ompi_check_mccl_happy=no])

    AS_IF([test "$ompi_check_mccl_happy" = "yes" && test "$enable_progress_threads" = "yes"],
          [AC_MSG_WARN([mccl driver does not currently support progress threads. Disabling MCCL.])
           ompi_check_mccl_happy="no"])

    AS_IF([test "$ompi_check_mccl_happy" = "yes"],
          [$2],
          [AS_IF([test ! -z "$with_mccl" && test "$with_mccl" != "no"],
                 [AC_MSG_ERROR([MCCL support requested but not found. Aborting])])
           $3])

    OPAL_VAR_SCOPE_POP
])
4 changes: 3 additions & 1 deletion ompi/mca/coll/base/coll_tags.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,9 @@
#define MCA_COLL_BASE_TAG_SCAN -24
#define MCA_COLL_BASE_TAG_SCATTER -25
#define MCA_COLL_BASE_TAG_SCATTERV -26
/* NOTE(review): MCA_COLL_BASE_TAG_NONBLOCKING_BASE appears twice in this
 * view (-27 here, -28 below). This looks like the old and new lines of a
 * diff hunk shown together; confirm that only the -28 definition remains
 * in the real file, since -27 is also the value of MCA_COLL_BASE_TAG_MCCL. */
#define MCA_COLL_BASE_TAG_NONBLOCKING_BASE -27
/* Tag value used for MCCL; presumably reserved for traffic generated by
 * the coll/mccl component -- TODO confirm against its users. */
#define MCA_COLL_BASE_TAG_MCCL -27

/* Nonblocking tags run from NONBLOCKING_BASE down to NONBLOCKING_END;
 * the neighbor range starts one below NONBLOCKING_END and spans 1024
 * values. */
#define MCA_COLL_BASE_TAG_NONBLOCKING_BASE -28
#define MCA_COLL_BASE_TAG_NONBLOCKING_END ((-1 * INT_MAX/2) + 1)
#define MCA_COLL_BASE_TAG_NEIGHBOR_BASE (MCA_COLL_BASE_TAG_NONBLOCKING_END - 1)
#define MCA_COLL_BASE_TAG_NEIGHBOR_END (MCA_COLL_BASE_TAG_NEIGHBOR_BASE - 1024)
Expand Down
46 changes: 46 additions & 0 deletions ompi/mca/coll/mccl/Makefile.am
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# -*- shell-script -*-
#
#
# Copyright (c) 2020 Mellanox Technologies. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
#

# Preprocessor flags needed to find the MCCL headers.
# NOTE(review): coll_mccl_CPPFLAGS/LDFLAGS/LIBS are presumably substituted
# by the component's configure logic, which is not visible here -- confirm.
AM_CPPFLAGS = $(coll_mccl_CPPFLAGS)

# Headers and sources of the component; shared by the DSO and the static
# library targets defined below.
coll_mccl_sources = \
coll_mccl.h \
coll_mccl_debug.h \
coll_mccl_dtypes.h \
coll_mccl_module.c \
coll_mccl_component.c \
coll_mccl_ops.c

# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).

# Exactly one of component_install / component_noinst is non-empty,
# depending on whether this component is built as a DSO.
if MCA_BUILD_ompi_coll_mccl_DSO
component_noinst =
component_install = mca_coll_mccl.la
else
component_noinst = libmca_coll_mccl.la
component_install =
endif

# DSO build: installed into $(ompilibdir) and linked against the MPI
# library in addition to the MCCL libraries.
mcacomponentdir = $(ompilibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_coll_mccl_la_SOURCES = $(coll_mccl_sources)
mca_coll_mccl_la_LIBADD = $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la \
$(coll_mccl_LIBS)
mca_coll_mccl_la_LDFLAGS = -module -avoid-version $(coll_mccl_LDFLAGS)

# Static build: convenience library that is not installed; it pulls in
# only the MCCL libraries.
noinst_LTLIBRARIES = $(component_noinst)
libmca_coll_mccl_la_SOURCES = $(coll_mccl_sources)
libmca_coll_mccl_la_LIBADD = $(coll_mccl_LIBS)
libmca_coll_mccl_la_LDFLAGS = -module -avoid-version $(coll_mccl_LDFLAGS)

103 changes: 103 additions & 0 deletions ompi/mca/coll/mccl/coll_mccl.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
/**
Copyright (c) 2020 Mellanox Technologies. All rights reserved.
$COPYRIGHT$
Additional copyrights may follow
$HEADER$
*/

#ifndef MCA_COLL_MCCL_H
#define MCA_COLL_MCCL_H

#include "ompi_config.h"

#include "mpi.h"
#include "ompi/mca/mca.h"
#include "opal/memoryhooks/memory.h"
#include "opal/mca/memory/base/base.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/request/request.h"
#include "ompi/mca/pml/pml.h"
#include "ompi/mca/coll/base/coll_tags.h"
#include "ompi/communicator/communicator.h"
#include "ompi/attribute/attribute.h"
#include "ompi/op/op.h"

#include "orte/runtime/orte_globals.h"

#include "api/mccl.h"

#include "coll_mccl_debug.h"
#if !defined(MCCL_VERSION)
/*
 * Fallback definition, used only when no previously included header has
 * already defined MCCL_VERSION.  Packs a (major, minor) pair into one
 * integer by shifting each part to its MCCL_MAJOR_BIT / MCCL_MINOR_BIT
 * position and OR-ing the results together.
 */
#define MCCL_VERSION(major_ver, minor_ver) \
    (((major_ver) << MCCL_MAJOR_BIT) | ((minor_ver) << MCCL_MINOR_BIT))
#endif /* !defined(MCCL_VERSION) */
BEGIN_C_DECLS

/**
 * Component-wide state of the MCCL collective component: the base coll
 * component followed by its MCA parameters and library handles.
 */
typedef struct mca_coll_mccl_component_t {
    /** Base coll component (kept as the first member) */
    mca_coll_base_component_2_0_0_t super;

    /** MCA parameter: priority of this component */
    int mccl_priority;

    /** MCA parameter: verbosity level of this component */
    int mccl_verbose;

    /** MCA parameter: enable MCCL */
    int mccl_enable;

    /** Read-only MCA parameter: libmccl version seen at compile time */
    char *compiletime_version;

    /** Read-only MCA parameter: libmccl version seen at run time */
    const char *runtime_version;

    /**
     * MCA parameter: minimal number of processes a communicator must
     * have for the corresponding mccl context to be created
     */
    int mccl_np;

    /** Whether or not mccl_init was ever called */
    bool libmccl_initialized;

    /** MCCL context handle (handle type comes from api/mccl.h) */
    mccl_context_h mccl_context;

    /**
     * Free list of requests.
     * NOTE(review): the element type is not visible in this header --
     * confirm against the component sources.
     */
    opal_free_list_t requests;
} mca_coll_mccl_component_t;

/** Component instance, declared here and defined elsewhere. */
OMPI_MODULE_DECLSPEC extern mca_coll_mccl_component_t mca_coll_mccl_component;

/**
* MCCL enabled communicator
*/
typedef struct mca_coll_mccl_module_t {
    /** Base coll module (kept as the first member) */
    mca_coll_base_module_t super;

    /** Communicator associated with this module */
    ompi_communicator_t *comm;

    /** Rank value; presumably the caller's rank in comm -- TODO confirm */
    int rank;

    /** MCCL communicator handle (handle type comes from api/mccl.h) */
    mccl_comm_h mccl_comm;

    /*
     * For each collective handled here, a function pointer and its owning
     * module are stored under the "previous_" prefix.
     * NOTE(review): presumably the implementation that was installed
     * before this module, kept as a fallback -- confirm in the module code.
     */
    mca_coll_base_module_allreduce_fn_t previous_allreduce;
    mca_coll_base_module_t *previous_allreduce_module;

    mca_coll_base_module_barrier_fn_t previous_barrier;
    mca_coll_base_module_t *previous_barrier_module;

    mca_coll_base_module_bcast_fn_t previous_bcast;
    mca_coll_base_module_t *previous_bcast_module;
} mca_coll_mccl_module_t;

OBJ_CLASS_DECLARATION(mca_coll_mccl_module_t);



/**
 * Component init query.
 *
 * @param enable_progress_threads  progress-thread flag passed by the caller
 * @param enable_mpi_threads       MPI-thread flag passed by the caller
 * @return int status code (NOTE(review): presumably OMPI_SUCCESS on
 *         success -- confirm in the component source)
 */
int mca_coll_mccl_init_query(bool enable_progress_threads,
                             bool enable_mpi_threads);

/**
 * Per-communicator query.
 *
 * @param comm      communicator being queried
 * @param priority  output location for a priority value
 * @return a coll base module pointer (NOTE(review): presumably NULL when
 *         MCCL is not used for this communicator -- confirm)
 */
mca_coll_base_module_t *
mca_coll_mccl_comm_query(struct ompi_communicator_t *comm,
                         int *priority);


/**
 * Allreduce entry point of the MCCL module.
 *
 * @param sbuf    send buffer (not written through)
 * @param rbuf    receive buffer
 * @param count   element count
 * @param dtype   datatype of the elements
 * @param op      reduction operation
 * @param comm    communicator
 * @param module  coll module the call is dispatched on
 * @return int status code
 */
int mca_coll_mccl_allreduce(const void *sbuf,
                            void *rbuf,
                            int count,
                            struct ompi_datatype_t *dtype,
                            struct ompi_op_t *op,
                            struct ompi_communicator_t *comm,
                            mca_coll_base_module_t *module);

/**
 * Barrier entry point of the MCCL module.
 *
 * @param comm    communicator
 * @param module  coll module the call is dispatched on
 * @return int status code
 */
int mca_coll_mccl_barrier(struct ompi_communicator_t *comm,
                          mca_coll_base_module_t *module);

/**
 * Broadcast entry point of the MCCL module.
 *
 * @param buf     buffer to broadcast
 * @param count   element count
 * @param dtype   datatype of the elements
 * @param root    root value identifying the broadcasting process
 * @param comm    communicator
 * @param module  coll module the call is dispatched on
 * @return int status code
 */
int mca_coll_mccl_bcast(void *buf,
                        int count,
                        struct ompi_datatype_t *dtype,
                        int root,
                        struct ompi_communicator_t *comm,
                        mca_coll_base_module_t *module);

END_C_DECLS
#endif
Loading

0 comments on commit e2222fd

Please sign in to comment.