diff --git a/openmp/libomptarget/include/dlwrap.h b/openmp/libomptarget/include/dlwrap.h
new file mode 100644
index 00000000000000..9e078b34ca5771
--- /dev/null
+++ b/openmp/libomptarget/include/dlwrap.h
@@ -0,0 +1,277 @@
+//===------- dlwrap.h - Convenience wrapper around dlopen/dlsym -- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The openmp plugins depend on external libraries. These can be used via:
+//  - bitcode file statically linked
+//  - (relocatable) object file statically linked
+//  - static library
+//  - dynamic library, linked at build time
+//  - dynamic library, loaded at application run time by dlopen
+//
+// This file factors out most boilerplate for using a dlopened library.
+//  - Function symbols are generated that are statically linked against
+//  - The dlopen can be done implicitly when initializing the library
+//  - dlsym lookups are done once and cached
+//  - The abstraction is very thin to permit varied uses of the library
+//
+// Given int foo(char, double, void*);, writing DLWRAP(foo, 3) will expand to:
+// int foo(char x0, double x1, void* x2) {
+//   constexpr size_t index = id();
+//   void * dlsymResult = *pointer(index);
+//   return ((int (*)(char, double, void*))dlsymResult)(x0, x1, x2);
+// }
+//
+// Multiple calls to DLWRAP(symbol_name, arity) with bespoke
+// initialization code that can use the thin abstraction:
+// namespace dlwrap {
+// static size_t size();
+// static const char *symbol(size_t);
+// static void **pointer(size_t);
+// }
+// will compile to an object file that only exposes the symbols that the
+// dynamic library would do, with the right function types.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef DLWRAP_H_INCLUDED
+#define DLWRAP_H_INCLUDED
+
+#include <array>
+#include <cstddef>
+#include <tuple>
+#include <utility>
+
+// Where symbol is a function, these expand to some book keeping and an
+// implementation of that function
+#define DLWRAP(SYMBOL, ARITY) DLWRAP_IMPL(SYMBOL, ARITY)
+#define DLWRAP_INTERNAL(SYMBOL, ARITY) DLWRAP_INTERNAL_IMPL(SYMBOL, ARITY)
+
+// For example, given a prototype:
+// int foo(char, double);
+//
+// DLWRAP(foo, 2) expands to:
+//
+// namespace dlwrap {
+// struct foo_Trait : public dlwrap::trait<decltype(&foo)> {
+//   using T = dlwrap::trait<decltype(&foo)>;
+//   static T::FunctionType get() {
+//     constexpr size_t Index = getIndex();
+//     void *P = *dlwrap::pointer(Index);
+//     return reinterpret_cast<T::FunctionType>(P);
+//   }
+// };
+// }
+// int foo(char x0, double x1) { return dlwrap::foo_Trait::get()(x0, x1); }
+//
+// DLWRAP_INTERNAL is similar, except the function it expands to is:
+// static int dlwrap_foo(char x0, double x1) { ... }
+// so that the function pointer call can be wrapped in library-specific code
+
+// DLWRAP_FINALIZE() expands to definitions of:
+#define DLWRAP_FINALIZE() DLWRAP_FINALIZE_IMPL()
+namespace dlwrap {
+static size_t size();
+static const char *symbol(size_t); // get symbol name in [0, size())
+static void **pointer(size_t); // get pointer to function pointer in [0, size())
+} // namespace dlwrap
+
+// Implementation details follow.
+
+namespace dlwrap {
+
+// Extract return / argument types from address of function symbol
+template <typename F> struct trait;
+template <typename R, typename... Ts> struct trait<R (*)(Ts...)> {
+  constexpr static const size_t nargs = sizeof...(Ts);
+  typedef R ReturnType;
+  template <size_t i> struct arg {
+    typedef typename std::tuple_element<i, std::tuple<Ts...>>::type type;
+  };
+
+  typedef R (*FunctionType)(Ts...);
+};
+
+namespace type {
+// Book keeping is by type specialization, keyed on the source line number
+
+template <size_t S> struct count {
+  static constexpr size_t N = count<S - 1>::N;
+};
+
+template <> struct count<0> { static constexpr size_t N = 0; };
+
+// Get a constexpr size_t ID: count of DLWRAP_INC() uses on lines <= this one
+#define DLWRAP_ID() (dlwrap::type::count<__LINE__>::N)
+
+// Increment value returned by DLWRAP_ID (usable at most once per source line)
+#define DLWRAP_INC() \
+  template <> struct dlwrap::type::count<__LINE__> { \
+    static constexpr size_t N = 1 + dlwrap::type::count<__LINE__ - 1>::N; \
+  }
+
+template <size_t N> struct symbol;
+#define DLWRAP_SYMBOL(SYMBOL, ID) \
+  template <> struct dlwrap::type::symbol<ID> { \
+    static constexpr const char *call() { return #SYMBOL; } \
+  }
+} // namespace type
+
+template <size_t N, size_t... Is>
+constexpr std::array<const char *, N> static getSymbolArray(
+    std::index_sequence<Is...>) {
+  return {{dlwrap::type::symbol<Is>::call()...}};
+}
+
+} // namespace dlwrap
+
+#define DLWRAP_INSTANTIATE(SYM_USE, SYM_DEF, ARITY) \
+  DLWRAP_INSTANTIATE_##ARITY(SYM_USE, SYM_DEF, \
+                             dlwrap::trait<decltype(&SYM_USE)>)
+
+#define DLWRAP_FINALIZE_IMPL() \
+  static size_t dlwrap::size() { return DLWRAP_ID(); } \
+  static const char *dlwrap::symbol(size_t i) { \
+    static constexpr const std::array<const char *, DLWRAP_ID()> \
+        dlwrap_symbols = getSymbolArray<DLWRAP_ID()>( \
+            std::make_index_sequence<DLWRAP_ID()>()); \
+    return dlwrap_symbols[i]; \
+  } \
+  static void **dlwrap::pointer(size_t i) { \
+    static std::array<void *, DLWRAP_ID()> dlwrap_pointers; \
+    return &dlwrap_pointers.data()[i]; \
+  }
+
+#define DLWRAP_COMMON(SYMBOL, ARITY) \
+  DLWRAP_INC(); \
+  DLWRAP_SYMBOL(SYMBOL, DLWRAP_ID() - 1); \
+  namespace dlwrap { \
+  struct SYMBOL##_Trait : public dlwrap::trait<decltype(&SYMBOL)> { \
+    using T = dlwrap::trait<decltype(&SYMBOL)>; \
+    static T::FunctionType get() { \
+      constexpr size_t Index = DLWRAP_ID() - 1; \
+      void *P = *dlwrap::pointer(Index); \
+      return reinterpret_cast<T::FunctionType>(P); \
+    } \
+  }; \
+  }
+
+#define DLWRAP_IMPL(SYMBOL, ARITY) \
+  DLWRAP_COMMON(SYMBOL, ARITY); \
+  DLWRAP_INSTANTIATE(SYMBOL, SYMBOL, ARITY)
+
+#define DLWRAP_INTERNAL_IMPL(SYMBOL, ARITY) \
+  DLWRAP_COMMON(SYMBOL, ARITY); \
+  static DLWRAP_INSTANTIATE(SYMBOL, dlwrap_##SYMBOL, ARITY)
+
+#define DLWRAP_INSTANTIATE_0(SYM_USE, SYM_DEF, T) \
+  T::ReturnType SYM_DEF() { return dlwrap::SYM_USE##_Trait::get()(); }
+#define DLWRAP_INSTANTIATE_1(SYM_USE, SYM_DEF, T) \
+  T::ReturnType SYM_DEF(typename T::template arg<0>::type x0) { \
+    return dlwrap::SYM_USE##_Trait::get()(x0); \
+  }
+#define DLWRAP_INSTANTIATE_2(SYM_USE, SYM_DEF, T) \
+  T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \
+                        typename T::template arg<1>::type x1) { \
+    return dlwrap::SYM_USE##_Trait::get()(x0, x1); \
+  }
+#define DLWRAP_INSTANTIATE_3(SYM_USE, SYM_DEF, T) \
+  T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \
+                        typename T::template arg<1>::type x1, \
+                        typename T::template arg<2>::type x2) { \
+    return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2); \
+  }
+#define DLWRAP_INSTANTIATE_4(SYM_USE, SYM_DEF, T) \
+  T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \
+                        typename T::template arg<1>::type x1, \
+                        typename T::template arg<2>::type x2, \
+                        typename T::template arg<3>::type x3) { \
+    return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3); \
+  }
+#define DLWRAP_INSTANTIATE_5(SYM_USE, SYM_DEF, T) \
+  T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \
+                        typename T::template arg<1>::type x1, \
+                        typename T::template arg<2>::type x2, \
+                        typename T::template arg<3>::type x3, \
+                        typename T::template arg<4>::type x4) { \
+    return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4); \
+  }
+#define DLWRAP_INSTANTIATE_6(SYM_USE, SYM_DEF, T) \
+  T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \
+                        typename T::template arg<1>::type x1, \
+                        typename T::template arg<2>::type x2, \
+                        typename T::template arg<3>::type x3, \
+                        typename T::template arg<4>::type x4, \
+                        typename T::template arg<5>::type x5) { \
+    return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5); \
+  }
+
+#define DLWRAP_INSTANTIATE_7(SYM_USE, SYM_DEF, T) \
+  T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \
+                        typename T::template arg<1>::type x1, \
+                        typename T::template arg<2>::type x2, \
+                        typename T::template arg<3>::type x3, \
+                        typename T::template arg<4>::type x4, \
+                        typename T::template arg<5>::type x5, \
+                        typename T::template arg<6>::type x6) { \
+    return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6); \
+  }
+
+#define DLWRAP_INSTANTIATE_8(SYM_USE, SYM_DEF, T) \
+  T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \
+                        typename T::template arg<1>::type x1, \
+                        typename T::template arg<2>::type x2, \
+                        typename T::template arg<3>::type x3, \
+                        typename T::template arg<4>::type x4, \
+                        typename T::template arg<5>::type x5, \
+                        typename T::template arg<6>::type x6, \
+                        typename T::template arg<7>::type x7) { \
+    return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6, x7); \
+  }
+#define DLWRAP_INSTANTIATE_9(SYM_USE, SYM_DEF, T) \
+  T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \
+                        typename T::template arg<1>::type x1, \
+                        typename T::template arg<2>::type x2, \
+                        typename T::template arg<3>::type x3, \
+                        typename T::template arg<4>::type x4, \
+                        typename T::template arg<5>::type x5, \
+                        typename T::template arg<6>::type x6, \
+                        typename T::template arg<7>::type x7, \
+                        typename T::template arg<8>::type x8) { \
+    return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6, x7, x8); \
+  }
+#define DLWRAP_INSTANTIATE_10(SYM_USE, SYM_DEF, T) \
+  T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \
+                        typename T::template arg<1>::type x1, \
+                        typename T::template arg<2>::type x2, \
+                        typename T::template arg<3>::type x3, \
+                        typename T::template arg<4>::type x4, \
+                        typename T::template arg<5>::type x5, \
+                        typename T::template arg<6>::type x6, \
+                        typename T::template arg<7>::type x7, \
+                        typename T::template arg<8>::type x8, \
+                        typename T::template arg<9>::type x9) { \
+    return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6, x7, x8, \
+                                          x9); \
+  }
+#define DLWRAP_INSTANTIATE_11(SYM_USE, SYM_DEF, T) \
+  T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \
+                        typename T::template arg<1>::type x1, \
+                        typename T::template arg<2>::type x2, \
+                        typename T::template arg<3>::type x3, \
+                        typename T::template arg<4>::type x4, \
+                        typename T::template arg<5>::type x5, \
+                        typename T::template arg<6>::type x6, \
+                        typename T::template arg<7>::type x7, \
+                        typename T::template arg<8>::type x8, \
+                        typename T::template arg<9>::type x9, \
+                        typename T::template arg<10>::type x10) { \
+    return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6, x7, x8, \
+                                          x9, x10); \
+  }
+
+#endif
diff --git a/openmp/libomptarget/plugins/cuda/CMakeLists.txt b/openmp/libomptarget/plugins/cuda/CMakeLists.txt
index 93887154bf1906..e5b2edfa664e5f 100644
--- a/openmp/libomptarget/plugins/cuda/CMakeLists.txt
+++ b/openmp/libomptarget/plugins/cuda/CMakeLists.txt
@@ -15,12 +15,6 @@ if (NOT(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)|(aarch64)$" AND CMAKE
 elseif (NOT LIBOMPTARGET_DEP_LIBELF_FOUND)
   libomptarget_say("Not building CUDA offloading plugin: libelf dependency not found.")
   return()
-elseif(NOT LIBOMPTARGET_DEP_CUDA_FOUND)
-  libomptarget_say("Not building CUDA offloading plugin: CUDA not found in system.")
-  return()
-elseif(NOT LIBOMPTARGET_DEP_CUDA_DRIVER_FOUND)
-  libomptarget_say("Not building CUDA offloading plugin: CUDA Driver API not found in system.")
-  return()
 endif()
 
 libomptarget_say("Building CUDA offloading plugin.")
@@ -28,10 +22,22 @@ libomptarget_say("Building CUDA offloading plugin.")
 # Define the suffix for the runtime messaging dumps.
 add_definitions(-DTARGET_NAME=CUDA)
 
-include_directories(${LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS})
 include_directories(${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS})
 
-add_library(omptarget.rtl.cuda SHARED src/rtl.cpp)
+option(LIBOMPTARGET_DLOPEN_LIBCUDA "Build with dlopened libcuda" OFF)
+
+if (LIBOMPTARGET_DEP_CUDA_FOUND AND LIBOMPTARGET_DEP_CUDA_DRIVER_FOUND
+    AND NOT LIBOMPTARGET_DLOPEN_LIBCUDA)
+  libomptarget_say("Building CUDA plugin linked against libcuda")
+  include_directories(${LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS})
+  add_library(omptarget.rtl.cuda SHARED src/rtl.cpp)
+  set (LIBOMPTARGET_DEP_LIBRARIES ${LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES})
+else()
+  libomptarget_say("Building CUDA plugin for dlopened libcuda")
+  include_directories(dynamic_cuda)
+  add_library(omptarget.rtl.cuda SHARED src/rtl.cpp dynamic_cuda/cuda.cpp)
+  set (LIBOMPTARGET_DEP_LIBRARIES ${CMAKE_DL_LIBS})
+endif()
 
 # Install plugin under the lib destination folder.
 install(TARGETS omptarget.rtl.cuda LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}")
@@ -39,7 +45,7 @@ install(TARGETS omptarget.rtl.cuda LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}
 target_link_libraries(omptarget.rtl.cuda
   elf_common
   MemoryManager
-  ${LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES}
+  ${LIBOMPTARGET_DEP_LIBRARIES}
   ${LIBOMPTARGET_DEP_LIBELF_LIBRARIES}
   "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports"
   "-Wl,-z,defs")
diff --git a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp b/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp
new file mode 100644
index 00000000000000..cc7bc42412f627
--- /dev/null
+++ b/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp
@@ -0,0 +1,99 @@
+//===--- cuda/dynamic_cuda/cuda.cpp ------------------------------- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implement subset of cuda api by calling into cuda library via dlopen
+// Does the dlopen/dlsym calls as part of the call to cuInit
+//
+//===----------------------------------------------------------------------===//
+
+#include "cuda.h"
+#include "Debug.h"
+#include "dlwrap.h"
+
+#include <dlfcn.h>
+
+DLWRAP_INTERNAL(cuInit, 1);
+
+DLWRAP(cuCtxGetDevice, 1);
+DLWRAP(cuDeviceGet, 2);
+DLWRAP(cuDeviceGetAttribute, 3);
+DLWRAP(cuDeviceGetCount, 1);
+DLWRAP(cuFuncGetAttribute, 3);
+
+DLWRAP(cuGetErrorString, 2);
+DLWRAP(cuLaunchKernel, 11);
+
+DLWRAP(cuMemAlloc, 2);
+DLWRAP(cuMemcpyDtoDAsync, 4);
+
+DLWRAP(cuMemcpyDtoH, 3);
+DLWRAP(cuMemcpyDtoHAsync, 4);
+DLWRAP(cuMemcpyHtoD, 3);
+DLWRAP(cuMemcpyHtoDAsync, 4);
+
+DLWRAP(cuMemFree, 1);
+DLWRAP(cuModuleGetFunction, 3);
+DLWRAP(cuModuleGetGlobal, 4);
+
+DLWRAP(cuModuleUnload, 1);
+DLWRAP(cuStreamCreate, 2);
+DLWRAP(cuStreamDestroy, 1);
+DLWRAP(cuStreamSynchronize, 1);
+DLWRAP(cuCtxSetCurrent, 1);
+DLWRAP(cuDevicePrimaryCtxRelease, 1);
+DLWRAP(cuDevicePrimaryCtxGetState, 3);
+DLWRAP(cuDevicePrimaryCtxSetFlags, 2);
+DLWRAP(cuDevicePrimaryCtxRetain, 2);
+DLWRAP(cuModuleLoadDataEx, 5);
+
+DLWRAP(cuDeviceCanAccessPeer, 3);
+DLWRAP(cuCtxEnablePeerAccess, 2);
+DLWRAP(cuMemcpyPeerAsync, 6);
+
+DLWRAP_FINALIZE();
+
+#ifndef DYNAMIC_CUDA_PATH
+#define DYNAMIC_CUDA_PATH "libcuda.so"
+#endif
+
+#define TARGET_NAME CUDA
+#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"
+
+static bool checkForCUDA() {
+  // Returns true iff dlopen succeeded and every wrapped symbol was resolved
+
+  const char *CudaLib = DYNAMIC_CUDA_PATH;
+  void *DynlibHandle = dlopen(CudaLib, RTLD_NOW);
+  if (!DynlibHandle) {
+    DP("Unable to load library '%s': %s!\n", CudaLib, dlerror());
+    return false;
+  }
+
+  for (size_t I = 0; I < dlwrap::size(); I++) {
+    const char *Sym = dlwrap::symbol(I);
+
+    void *P = dlsym(DynlibHandle, Sym);
+    if (P == nullptr) {
+      DP("Unable to find '%s' in '%s'!\n", Sym, CudaLib);
+      return false; // NOTE(review): DynlibHandle is left open on this path
+    }
+
+    *dlwrap::pointer(I) = P;
+  }
+
+  return true;
+}
+
+CUresult cuInit(unsigned X) {
+  // Note: Called exactly once from cuda rtl.cpp in a global constructor so
+  // does not need to handle being called repeatedly or concurrently
+  if (!checkForCUDA()) {
+    return CUDA_ERROR_INVALID_VALUE;
+  }
+  return dlwrap_cuInit(X);
+}
diff --git a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h b/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h
new file mode 100644
index 00000000000000..832c2696514448
--- /dev/null
+++ b/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h
@@ -0,0 +1,104 @@
+//===--- cuda/dynamic_cuda/cuda.h --------------------------------- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The parts of the cuda api that are presently in use by the openmp cuda plugin
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef DYNAMIC_CUDA_CUDA_H_INCLUDED
+#define DYNAMIC_CUDA_CUDA_H_INCLUDED
+
+#include <cstddef>
+#include <cstdint>
+
+typedef int CUdevice;
+typedef uintptr_t CUdeviceptr;
+typedef struct CUmod_st *CUmodule;
+typedef struct CUctx_st *CUcontext;
+typedef struct CUfunc_st *CUfunction;
+typedef struct CUstream_st *CUstream;
+
+typedef enum cudaError_enum {
+  CUDA_SUCCESS = 0,
+  CUDA_ERROR_INVALID_VALUE = 1,
+} CUresult;
+
+typedef enum CUstream_flags_enum {
+  CU_STREAM_DEFAULT = 0x0,
+  CU_STREAM_NON_BLOCKING = 0x1,
+} CUstream_flags;
+
+typedef enum CUdevice_attribute_enum {
+  CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2,
+  CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5,
+  CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10,
+} CUdevice_attribute;
+
+typedef enum CUfunction_attribute_enum {
+  CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0,
+} CUfunction_attribute;
+
+typedef enum CUctx_flags_enum {
+  CU_CTX_SCHED_BLOCKING_SYNC = 0x04,
+  CU_CTX_SCHED_MASK = 0x07,
+} CUctx_flags;
+
+#define cuMemFree cuMemFree_v2
+#define cuMemAlloc cuMemAlloc_v2
+#define cuMemcpyDtoH cuMemcpyDtoH_v2
+#define cuMemcpyHtoD cuMemcpyHtoD_v2
+#define cuStreamDestroy cuStreamDestroy_v2
+#define cuModuleGetGlobal cuModuleGetGlobal_v2
+#define cuMemcpyDtoHAsync cuMemcpyDtoHAsync_v2
+#define cuMemcpyDtoDAsync cuMemcpyDtoDAsync_v2
+#define cuMemcpyHtoDAsync cuMemcpyHtoDAsync_v2
+#define cuDevicePrimaryCtxRelease cuDevicePrimaryCtxRelease_v2
+#define cuDevicePrimaryCtxSetFlags cuDevicePrimaryCtxSetFlags_v2
+
+CUresult cuCtxGetDevice(CUdevice *);
+CUresult cuDeviceGet(CUdevice *, int);
+CUresult cuDeviceGetAttribute(int *, CUdevice_attribute, CUdevice);
+CUresult cuDeviceGetCount(int *);
+CUresult cuFuncGetAttribute(int *, CUfunction_attribute, CUfunction);
+
+CUresult cuGetErrorString(CUresult, const char **);
+CUresult cuInit(unsigned);
+CUresult cuLaunchKernel(CUfunction, unsigned, unsigned, unsigned, unsigned,
+                        unsigned, unsigned, unsigned, CUstream, void **,
+                        void **);
+
+CUresult cuMemAlloc(CUdeviceptr *, size_t);
+CUresult cuMemcpyDtoDAsync(CUdeviceptr, CUdeviceptr, size_t, CUstream);
+
+CUresult cuMemcpyDtoH(void *, CUdeviceptr, size_t);
+CUresult cuMemcpyDtoHAsync(void *, CUdeviceptr, size_t, CUstream);
+CUresult cuMemcpyHtoD(CUdeviceptr, const void *, size_t);
+CUresult cuMemcpyHtoDAsync(CUdeviceptr, const void *, size_t, CUstream);
+
+CUresult cuMemFree(CUdeviceptr);
+CUresult cuModuleGetFunction(CUfunction *, CUmodule, const char *);
+CUresult cuModuleGetGlobal(CUdeviceptr *, size_t *, CUmodule, const char *);
+
+CUresult cuModuleUnload(CUmodule);
+CUresult cuStreamCreate(CUstream *, unsigned);
+CUresult cuStreamDestroy(CUstream);
+CUresult cuStreamSynchronize(CUstream);
+CUresult cuCtxSetCurrent(CUcontext);
+CUresult cuDevicePrimaryCtxRelease(CUdevice);
+CUresult cuDevicePrimaryCtxGetState(CUdevice, unsigned *, int *);
+CUresult cuDevicePrimaryCtxSetFlags(CUdevice, unsigned);
+CUresult cuDevicePrimaryCtxRetain(CUcontext *, CUdevice);
+CUresult cuModuleLoadDataEx(CUmodule *, const void *, unsigned, void *,
+                            void **);
+
+CUresult cuDeviceCanAccessPeer(int *, CUdevice, CUdevice);
+CUresult cuCtxEnablePeerAccess(CUcontext, unsigned);
+CUresult cuMemcpyPeerAsync(CUdeviceptr, CUcontext, CUdeviceptr, CUcontext,
+                           size_t, CUstream);
+
+#endif