[SE] Add CUDA platform

Summary: Basic CUDA platform implementation and cmake infrastructure to control whether it's used. A few important TODOs will be handled in later patches: * Log some error messages that can't easily be returned as Errors. * Cache modules and kernels to prevent reloading them if someone tries to reload a kernel that's already loaded. * Tolerate shared memory arguments for kernel launches. Reviewers: jlebar Subscribers: beanz, mgorny, jprice, jlebar, parallel_libs-commits Differential Revision: https://reviews.llvm.org/D24538 llvm-svn: 281524
llvm · Sep 14, 2016 · 6bfc863 · 6bfc863
1 parent d56a27e
commit 6bfc863
Show file tree

Hide file tree

Showing 13 changed files with 596 additions and 17 deletions.
diff --git a/parallel-libs/streamexecutor/CMakeLists.txt b/parallel-libs/streamexecutor/CMakeLists.txt
@@ -3,9 +3,14 @@ cmake_minimum_required(VERSION 3.1)
 option(STREAM_EXECUTOR_UNIT_TESTS "enable unit tests" ON)
 option(STREAM_EXECUTOR_ENABLE_DOXYGEN "enable StreamExecutor doxygen" ON)
 option(STREAM_EXECUTOR_ENABLE_CONFIG_TOOL "enable building streamexecutor-config tool" ON)
+option(STREAM_EXECUTOR_ENABLE_CUDA_PLATFORM "enable building the CUDA StreamExecutor platform" OFF)
+
+# Generate PlatformOptions.h in the binary tree from its template in the
+# source tree, so the header reflects the platform options chosen above.
+configure_file(
+    "${CMAKE_CURRENT_SOURCE_DIR}/include/streamexecutor/PlatformOptions.h.in"
+    "${CMAKE_CURRENT_BINARY_DIR}/include/streamexecutor/PlatformOptions.h")
 
 # First find includes relative to the streamexecutor top-level source path.
 include_directories(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/include)
+# Also look for configured headers in the top-level binary directory.
+include_directories(BEFORE ${CMAKE_CURRENT_BINARY_DIR}/include)
 
 # If we are not building as part of LLVM, build StreamExecutor as a standalone
 # project using LLVM as an external library:

diff --git a/parallel-libs/streamexecutor/include/streamexecutor/PlatformDevice.h b/parallel-libs/streamexecutor/include/streamexecutor/PlatformDevice.h
@@ -37,33 +37,38 @@ class PlatformDevice {
 
   virtual std::string getName() const = 0;
 
+  virtual std::string getPlatformName() const = 0;
+
   /// Creates a platform-specific kernel.
   virtual Expected<const void *>
   createKernel(const MultiKernelLoaderSpec &Spec) {
-    return make_error("createKernel not implemented for platform " + getName());
+    return make_error("createKernel not implemented for platform " +
+                      getPlatformName());
   }
 
   virtual Error destroyKernel(const void *Handle) {
     return make_error("destroyKernel not implemented for platform " +
-                      getName());
+                      getPlatformName());
   }
 
   /// Creates a platform-specific stream.
   virtual Expected<const void *> createStream() {
-    return make_error("createStream not implemented for platform " + getName());
+    return make_error("createStream not implemented for platform " +
+                      getPlatformName());
   }
 
   virtual Error destroyStream(const void *Handle) {
     return make_error("destroyStream not implemented for platform " +
-                      getName());
+                      getPlatformName());
   }
 
   /// Launches a kernel on the given stream.
   virtual Error launch(const void *PlatformStreamHandle,
                        BlockDimensions BlockSize, GridDimensions GridSize,
                        const void *PKernelHandle,
                        const PackedKernelArgumentArrayBase &ArgumentArray) {
-    return make_error("launch not implemented for platform " + getName());
+    return make_error("launch not implemented for platform " +
+                      getPlatformName());
   }
 
   /// Copies data from the device to the host.
@@ -72,7 +77,8 @@ class PlatformDevice {
   virtual Error copyD2H(const void *PlatformStreamHandle,
                         const void *DeviceSrcHandle, size_t SrcByteOffset,
                         void *HostDst, size_t DstByteOffset, size_t ByteCount) {
-    return make_error("copyD2H not implemented for platform " + getName());
+    return make_error("copyD2H not implemented for platform " +
+                      getPlatformName());
   }
 
   /// Copies data from the host to the device.
@@ -81,47 +87,49 @@ class PlatformDevice {
   virtual Error copyH2D(const void *PlatformStreamHandle, const void *HostSrc,
                         size_t SrcByteOffset, const void *DeviceDstHandle,
                         size_t DstByteOffset, size_t ByteCount) {
-    return make_error("copyH2D not implemented for platform " + getName());
+    return make_error("copyH2D not implemented for platform " +
+                      getPlatformName());
   }
 
   /// Copies data from one device location to another.
   virtual Error copyD2D(const void *PlatformStreamHandle,
                         const void *DeviceSrcHandle, size_t SrcByteOffset,
                         const void *DeviceDstHandle, size_t DstByteOffset,
                         size_t ByteCount) {
-    return make_error("copyD2D not implemented for platform " + getName());
+    return make_error("copyD2D not implemented for platform " +
+                      getPlatformName());
   }
 
   /// Blocks the host until the given stream completes all the work enqueued up
   /// to the point this function is called.
   virtual Error blockHostUntilDone(const void *PlatformStreamHandle) {
     return make_error("blockHostUntilDone not implemented for platform " +
-                      getName());
+                      getPlatformName());
   }
 
   /// Allocates untyped device memory of a given size in bytes.
   virtual Expected<void *> allocateDeviceMemory(size_t ByteCount) {
     return make_error("allocateDeviceMemory not implemented for platform " +
-                      getName());
+                      getPlatformName());
   }
 
   /// Frees device memory previously allocated by allocateDeviceMemory.
   virtual Error freeDeviceMemory(const void *Handle) {
     return make_error("freeDeviceMemory not implemented for platform " +
-                      getName());
+                      getPlatformName());
   }
 
   /// Registers previously allocated host memory so it can be used with copyH2D
   /// and copyD2H.
   virtual Error registerHostMemory(void *Memory, size_t ByteCount) {
     return make_error("registerHostMemory not implemented for platform " +
-                      getName());
+                      getPlatformName());
   }
 
   /// Unregisters host memory previously registered with registerHostMemory.
   virtual Error unregisterHostMemory(const void *Memory) {
     return make_error("unregisterHostMemory not implemented for platform " +
-                      getName());
+                      getPlatformName());
   }
 
   /// Copies the given number of bytes from device memory to host memory.
@@ -133,7 +141,7 @@ class PlatformDevice {
                                    size_t SrcByteOffset, void *HostDst,
                                    size_t DstByteOffset, size_t ByteCount) {
     return make_error("synchronousCopyD2H not implemented for platform " +
-                      getName());
+                      getPlatformName());
   }
 
   /// Similar to synchronousCopyD2H(const void *, size_t, void
@@ -143,7 +151,7 @@ class PlatformDevice {
                                    const void *DeviceDstHandle,
                                    size_t DstByteOffset, size_t ByteCount) {
     return make_error("synchronousCopyH2D not implemented for platform " +
-                      getName());
+                      getPlatformName());
   }
 
   /// Similar to synchronousCopyD2H(const void *, size_t, void
@@ -154,7 +162,7 @@ class PlatformDevice {
                                    const void *DeviceDstHandle,
                                    size_t DstByteOffset, size_t ByteCount) {
     return make_error("synchronousCopyD2D not implemented for platform " +
-                      getName());
+                      getPlatformName());
   }
 };
 

diff --git a/parallel-libs/streamexecutor/include/streamexecutor/PlatformOptions.h.in b/parallel-libs/streamexecutor/include/streamexecutor/PlatformOptions.h.in
@@ -0,0 +1,23 @@
+//===-- PlatformOptions.h - Platform option macros --------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// The contents of this file are filled in at configuration time. This file
+/// defines macros that represent the platform configuration state of the build,
+/// e.g. which platforms are enabled.
+///
+//===----------------------------------------------------------------------===//
+
+
+#ifndef STREAMEXECUTOR_PLATFORMOPTIONS_H
+#define STREAMEXECUTOR_PLATFORMOPTIONS_H
+
+#cmakedefine STREAM_EXECUTOR_ENABLE_CUDA_PLATFORM
+
+#endif // STREAMEXECUTOR_PLATFORMOPTIONS_H
diff --git a/parallel-libs/streamexecutor/include/streamexecutor/platforms/cuda/CUDAPlatform.h b/parallel-libs/streamexecutor/include/streamexecutor/platforms/cuda/CUDAPlatform.h
@@ -0,0 +1,42 @@
+//===-- CUDAPlatform.h - CUDA platform subclass -----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Declaration of the CUDAPlatform class.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef STREAMEXECUTOR_PLATFORMS_CUDA_CUDAPLATFORM_H
+#define STREAMEXECUTOR_PLATFORMS_CUDA_CUDAPLATFORM_H
+
+#include "streamexecutor/Platform.h"
+#include "streamexecutor/platforms/cuda/CUDAPlatformDevice.h"
+
+#include "llvm/Support/Mutex.h"
+
+#include <map>
+
+namespace streamexecutor {
+namespace cuda {
+
+/// Platform subclass for CUDA.
+///
+/// Overrides the device-count and device-lookup entry points of Platform and
+/// keeps a map of CUDAPlatformDevice objects.
+class CUDAPlatform : public Platform {
+public:
+  size_t getDeviceCount() const override;
+
+  Expected<Device> getDevice(size_t DeviceIndex) override;
+
+private:
+  // NOTE(review): presumably guards PlatformDevices against concurrent
+  // getDevice() calls -- confirm in CUDAPlatform.cpp.
+  llvm::sys::Mutex Mutex;
+  // CUDAPlatformDevice objects keyed by a size_t; presumably the key is the
+  // device index passed to getDevice() -- confirm in CUDAPlatform.cpp.
+  std::map<size_t, CUDAPlatformDevice> PlatformDevices;
+};
+
+} // namespace cuda
+} // namespace streamexecutor
+
+#endif // STREAMEXECUTOR_PLATFORMS_CUDA_CUDAPLATFORM_H
diff --git a/parallel-libs/streamexecutor/include/streamexecutor/platforms/cuda/CUDAPlatformDevice.h b/parallel-libs/streamexecutor/include/streamexecutor/platforms/cuda/CUDAPlatformDevice.h
@@ -0,0 +1,93 @@
+//===-- CUDAPlatformDevice.h - CUDAPlatformDevice class ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Declaration of the CUDAPlatformDevice class.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef STREAMEXECUTOR_PLATFORMS_CUDA_CUDAPLATFORMDEVICE_H
+#define STREAMEXECUTOR_PLATFORMS_CUDA_CUDAPLATFORMDEVICE_H
+
+#include "streamexecutor/PlatformDevice.h"
+
+namespace streamexecutor {
+namespace cuda {
+
+/// Builds an Error from an integer CUDA result code and a message.
+///
+/// NOTE(review): presumably yields success when CUResult is the CUDA success
+/// code and otherwise an error that includes Message -- confirm against the
+/// definition.
+Error CUresultToError(int CUResult, const llvm::Twine &Message);
+
+/// PlatformDevice implementation for the CUDA platform.
+///
+/// The class is move-only: copy construction and copy assignment are deleted.
+/// The constructor is private, so external code obtains instances through
+/// create() or by moving an existing instance.
+class CUDAPlatformDevice : public PlatformDevice {
+public:
+  /// Creates the platform device for the given device index, or returns an
+  /// error.
+  static Expected<CUDAPlatformDevice> create(size_t DeviceIndex);
+
+  CUDAPlatformDevice(const CUDAPlatformDevice &) = delete;
+  CUDAPlatformDevice &operator=(const CUDAPlatformDevice &) = delete;
+
+  CUDAPlatformDevice(CUDAPlatformDevice &&) noexcept;
+  CUDAPlatformDevice &operator=(CUDAPlatformDevice &&) noexcept;
+
+  ~CUDAPlatformDevice() override;
+
+  std::string getName() const override;
+
+  /// Returns the fixed platform name "CUDA".
+  std::string getPlatformName() const override { return "CUDA"; }
+
+  Expected<const void *>
+  createKernel(const MultiKernelLoaderSpec &Spec) override;
+  Error destroyKernel(const void *Handle) override;
+
+  Expected<const void *> createStream() override;
+  Error destroyStream(const void *Handle) override;
+
+  Error launch(const void *PlatformStreamHandle, BlockDimensions BlockSize,
+               GridDimensions GridSize, const void *PKernelHandle,
+               const PackedKernelArgumentArrayBase &ArgumentArray) override;
+
+  Error copyD2H(const void *PlatformStreamHandle, const void *DeviceSrcHandle,
+                size_t SrcByteOffset, void *HostDst, size_t DstByteOffset,
+                size_t ByteCount) override;
+
+  Error copyH2D(const void *PlatformStreamHandle, const void *HostSrc,
+                size_t SrcByteOffset, const void *DeviceDstHandle,
+                size_t DstByteOffset, size_t ByteCount) override;
+
+  Error copyD2D(const void *PlatformStreamHandle, const void *DeviceSrcHandle,
+                size_t SrcByteOffset, const void *DeviceDstHandle,
+                size_t DstByteOffset, size_t ByteCount) override;
+
+  Error blockHostUntilDone(const void *PlatformStreamHandle) override;
+
+  Expected<void *> allocateDeviceMemory(size_t ByteCount) override;
+  Error freeDeviceMemory(const void *Handle) override;
+
+  Error registerHostMemory(void *Memory, size_t ByteCount) override;
+  Error unregisterHostMemory(const void *Memory) override;
+
+  Error synchronousCopyD2H(const void *DeviceSrcHandle, size_t SrcByteOffset,
+                           void *HostDst, size_t DstByteOffset,
+                           size_t ByteCount) override;
+
+  Error synchronousCopyH2D(const void *HostSrc, size_t SrcByteOffset,
+                           const void *DeviceDstHandle, size_t DstByteOffset,
+                           size_t ByteCount) override;
+
+  // Parameter names follow the PlatformDevice order (source first, then
+  // destination), matching copyD2D above. Overrides bind by position, so
+  // destination-first names here would invite an implementation that copies
+  // in the wrong direction.
+  // NOTE(review): make sure the out-of-line definition uses the same order.
+  Error synchronousCopyD2D(const void *DeviceSrcHandle, size_t SrcByteOffset,
+                           const void *DeviceDstHandle, size_t DstByteOffset,
+                           size_t ByteCount) override;
+
+private:
+  /// Stores the device index; reachable only from within the class.
+  CUDAPlatformDevice(size_t DeviceIndex) : DeviceIndex(DeviceIndex) {}
+
+  // NOTE(review): the constructor takes a size_t but the index is stored as an
+  // int, so the value is implicitly narrowed -- confirm this is intended.
+  int DeviceIndex;
+};
+
+} // namespace cuda
+} // namespace streamexecutor
+
+#endif // STREAMEXECUTOR_PLATFORMS_CUDA_CUDAPLATFORMDEVICE_H
diff --git a/parallel-libs/streamexecutor/include/streamexecutor/platforms/host/HostPlatformDevice.h b/parallel-libs/streamexecutor/include/streamexecutor/platforms/host/HostPlatformDevice.h
@@ -29,6 +29,8 @@ class HostPlatformDevice : public PlatformDevice {
 public:
   std::string getName() const override { return "host"; }
 
+  std::string getPlatformName() const override { return "host"; }
+
   Expected<const void *>
   createKernel(const MultiKernelLoaderSpec &Spec) override {
     if (!Spec.hasHostFunction()) {

diff --git a/parallel-libs/streamexecutor/lib/CMakeLists.txt b/parallel-libs/streamexecutor/lib/CMakeLists.txt
@@ -3,6 +3,26 @@ macro(add_se_library name)
   set_target_properties(${name} PROPERTIES FOLDER "streamexecutor libraries")
 endmacro(add_se_library)
 
+if(STREAM_EXECUTOR_ENABLE_CUDA_PLATFORM)
+    # Let find_package(Libcuda) search the CUDA platform's module directory.
+    list(
+        APPEND
+        CMAKE_MODULE_PATH
+        "${CMAKE_CURRENT_SOURCE_DIR}/platforms/cuda/cmake/modules/")
+
+    find_package(Libcuda REQUIRED)
+    include_directories(${LIBCUDA_INCLUDE_DIRS})
+
+    # These two variables are only set here, so they expand to nothing in the
+    # add_se_library() call below when the CUDA platform is disabled.
+    set(STREAM_EXECUTOR_CUDA_PLATFORM_TARGET_OBJECT
+        $<TARGET_OBJECTS:streamexecutor_cuda_platform>)
+    set(STREAM_EXECUTOR_LIBCUDA_LIBRARIES
+        ${LIBCUDA_LIBRARIES})
+endif()
+
+add_subdirectory(platforms)
+
 add_se_library(
     streamexecutor
     Device.cpp
@@ -16,6 +36,8 @@ add_se_library(
     PlatformDevice.cpp
     PlatformManager.cpp
     Stream.cpp
-    )
+    ${STREAM_EXECUTOR_CUDA_PLATFORM_TARGET_OBJECT}
+    LINK_LIBS
+    ${STREAM_EXECUTOR_LIBCUDA_LIBRARIES})
 
 install(TARGETS streamexecutor DESTINATION lib)
diff --git a/parallel-libs/streamexecutor/lib/PlatformManager.cpp b/parallel-libs/streamexecutor/lib/PlatformManager.cpp
@@ -13,8 +13,14 @@
 //===----------------------------------------------------------------------===//
 
 #include "streamexecutor/PlatformManager.h"
+
+#include "streamexecutor/PlatformOptions.h"
 #include "streamexecutor/platforms/host/HostPlatform.h"
 
+#ifdef STREAM_EXECUTOR_ENABLE_CUDA_PLATFORM
+#include "streamexecutor/platforms/cuda/CUDAPlatform.h"
+#endif
+
 namespace streamexecutor {
 
 PlatformManager::PlatformManager() {
@@ -26,6 +32,10 @@ PlatformManager::PlatformManager() {
   //    themselves when they are loaded.
 
   PlatformsByName.emplace("host", llvm::make_unique<host::HostPlatform>());
+
+#ifdef STREAM_EXECUTOR_ENABLE_CUDA_PLATFORM
+  PlatformsByName.emplace("cuda", llvm::make_unique<cuda::CUDAPlatform>());
+#endif
 }
 
 Expected<Platform *> PlatformManager::getPlatformByName(llvm::StringRef Name) {

diff --git a/parallel-libs/streamexecutor/lib/platforms/CMakeLists.txt b/parallel-libs/streamexecutor/lib/platforms/CMakeLists.txt
@@ -0,0 +1,3 @@
+# Only descend into the CUDA platform directory when that platform is enabled.
+if(STREAM_EXECUTOR_ENABLE_CUDA_PLATFORM)
+    add_subdirectory(cuda)
+endif()
diff --git a/parallel-libs/streamexecutor/lib/platforms/cuda/CMakeLists.txt b/parallel-libs/streamexecutor/lib/platforms/cuda/CMakeLists.txt
@@ -0,0 +1,5 @@
+# Compile the CUDA platform sources as an OBJECT library. No archive is
+# produced; the object files are meant to be pulled into another target
+# through $<TARGET_OBJECTS:streamexecutor_cuda_platform>.
+add_library(
+    streamexecutor_cuda_platform
+    OBJECT
+    CUDAPlatform.cpp
+    CUDAPlatformDevice.cpp)