Skip to content

Commit

Permalink
[StreamExecutor] Add Doxygen main page
Browse files Browse the repository at this point in the history
Reviewers: jlebar

Subscribers: jprice, parallel_libs-commits

Differential Revision: https://reviews.llvm.org/D24066

llvm-svn: 280277
  • Loading branch information
henline committed Aug 31, 2016
1 parent bd850f4 commit 5b363dd
Show file tree
Hide file tree
Showing 7 changed files with 243 additions and 5 deletions.
1 change: 1 addition & 0 deletions parallel-libs/streamexecutor/CMakeLists.txt
Expand Up @@ -62,6 +62,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wno-unused-parameter")

add_subdirectory(lib)
add_subdirectory(examples)

if (STREAM_EXECUTOR_ENABLE_DOXYGEN)
find_package(Doxygen REQUIRED)
Expand Down
4 changes: 2 additions & 2 deletions parallel-libs/streamexecutor/Doxyfile.in
Expand Up @@ -398,7 +398,7 @@ LOOKUP_CACHE_SIZE = 0
# normally produced when WARNINGS is set to YES.
# The default value is: NO.

EXTRACT_ALL = NO
EXTRACT_ALL = YES

# If the EXTRACT_PRIVATE tag is set to YES all private members of a class will
# be included in the documentation.
Expand Down Expand Up @@ -811,7 +811,7 @@ EXCLUDE_SYMBOLS =
# that contain example code fragments that are included (see the \include
# command).

EXAMPLE_PATH =
EXAMPLE_PATH = @CMAKE_CURRENT_SOURCE_DIR@

# If the value of the EXAMPLE_PATH tag contains directories, you can use the
# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and
Expand Down
2 changes: 2 additions & 0 deletions parallel-libs/streamexecutor/examples/CMakeLists.txt
@@ -0,0 +1,2 @@
# Build the documentation example program (snippets from Example.cpp are
# included in the Doxygen docs, so keeping it compiling keeps the docs valid).
add_executable(example Example.cpp)
# Link against the StreamExecutor library built in ../lib.
target_link_libraries(example streamexecutor)
163 changes: 163 additions & 0 deletions parallel-libs/streamexecutor/examples/Example.cpp
@@ -0,0 +1,163 @@
//===-- Example.cpp - Example code for documentation ----------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains example code demonstrating the usage of the
/// StreamExecutor API. Snippets of this file will be included as code examples
/// in documentation. Taking these examples from a real source file guarantees
/// that the examples will always compile.
///
//===----------------------------------------------------------------------===//

#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <memory>
#include <vector>

#include "streamexecutor/StreamExecutor.h"

/// [Example saxpy host helper functions]
// Example handler for streamexecutor::Expected return values.
//
// On success, moves the wrapped value out of E and returns it. On failure,
// prints the error message to stderr and terminates the process with
// EXIT_FAILURE.
template <typename T> T getOrDie(streamexecutor::Expected<T> &&E) {
  if (E)
    return std::move(*E);
  std::fprintf(stderr, "Error extracting an expected value: %s.\n",
               streamexecutor::consumeAndGetMessage(E.takeError()).c_str());
  std::exit(EXIT_FAILURE);
}

// Example handler for streamexecutor::Error return values.
//
// A set (truthy) Error indicates failure: its message is printed to stderr
// and the process exits with EXIT_FAILURE. A success value is a no-op.
void check(streamexecutor::Error &&E) {
  if (!E)
    return;
  std::fprintf(stderr, "Error encountered: %s.\n",
               streamexecutor::consumeAndGetMessage(std::move(E)).c_str());
  std::exit(EXIT_FAILURE);
}
/// [Example saxpy host helper functions]

/// [Example saxpy compiler-generated]
// Code in this namespace is generated by the compiler (e.g. clang).
//
// The name of this namespace may depend on the compiler that generated it, so
// this is just an example name.
namespace __compilergen {

// Specialization of the streamexecutor::Kernel template class for the
// parameter types of the saxpy(float A, float *X, float *Y) kernel: one float
// scalar followed by two float device-memory arrays.
using SaxpyKernel =
    streamexecutor::Kernel<float, streamexecutor::GlobalDeviceMemory<float>,
                           streamexecutor::GlobalDeviceMemory<float>>;

// A string containing the PTX code generated by the device compiler for the
// saxpy kernel. String contents not shown here (defined later in this file).
extern const char *SaxpyPTX;

// A global instance of a loader spec that knows how to load the code in the
// SaxpyPTX string. The immediately-invoked lambda registers the PTX under the
// kernel name "saxpy"; the {2, 0} pair presumably identifies compute
// capability 2.0 (it matches the PTX ".target sm_20") — confirm against
// CUDAPTXInMemorySpec's documentation.
static streamexecutor::MultiKernelLoaderSpec SaxpyLoaderSpec = []() {
  streamexecutor::MultiKernelLoaderSpec Spec;
  Spec.addCUDAPTXInMemory("saxpy", {{{2, 0}, SaxpyPTX}});
  return Spec;
}();

} // namespace __compilergen
/// [Example saxpy compiler-generated]

/// [Example saxpy host PTX]
// Definition of the PTX string declared in namespace __compilergen above.
// The PTX targets sm_20 with 64-bit addressing and computes, for thread index
// i = %tid.x, X[i] = A * X[i] + Y[i] (the fma result is stored back into X).
// NOTE: this is a runtime string constant; its contents must stay exactly in
// sync with the SaxpyKernel parameter list and the "saxpy" loader spec entry.
const char *__compilergen::SaxpyPTX = R"(
.version 4.3
.target sm_20
.address_size 64
.visible .entry saxpy(.param .f32 A, .param .u64 X, .param .u64 Y) {
.reg .f32 %AValue;
.reg .f32 %XValue;
.reg .f32 %YValue;
.reg .f32 %Result;
.reg .b64 %XBaseAddrGeneric;
.reg .b64 %YBaseAddrGeneric;
.reg .b64 %XBaseAddrGlobal;
.reg .b64 %YBaseAddrGlobal;
.reg .b64 %XAddr;
.reg .b64 %YAddr;
.reg .b64 %ThreadByteOffset;
.reg .b32 %TID;
ld.param.f32 %AValue, [A];
ld.param.u64 %XBaseAddrGeneric, [X];
ld.param.u64 %YBaseAddrGeneric, [Y];
cvta.to.global.u64 %XBaseAddrGlobal, %XBaseAddrGeneric;
cvta.to.global.u64 %YBaseAddrGlobal, %YBaseAddrGeneric;
mov.u32 %TID, %tid.x;
mul.wide.u32 %ThreadByteOffset, %TID, 4;
add.s64 %XAddr, %ThreadByteOffset, %XBaseAddrGlobal;
add.s64 %YAddr, %ThreadByteOffset, %YBaseAddrGlobal;
ld.global.f32 %XValue, [%XAddr];
ld.global.f32 %YValue, [%YAddr];
fma.rn.f32 %Result, %AValue, %XValue, %YValue;
st.global.f32 [%XAddr], %Result;
ret;
}
)";
/// [Example saxpy host PTX]

int main() {
  /// [Example saxpy host main]
  namespace se = ::streamexecutor;
  namespace cg = ::__compilergen;

  // Set up some host-side input data.
  float A = 42.0f;
  std::vector<float> HostX = {0, 1, 2, 3};
  std::vector<float> HostY = {4, 5, 6, 7};
  size_t ArraySize = HostX.size();

  // Get a device object for the CUDA platform, bailing out if no CUDA
  // device is present.
  se::Platform *Platform =
      getOrDie(se::PlatformManager::getPlatformByName("CUDA"));
  if (Platform->getDeviceCount() == 0)
    return EXIT_FAILURE;
  se::Device *Device = getOrDie(Platform->getDevice(0));

  // Load the kernel onto the device.
  std::unique_ptr<cg::SaxpyKernel> Kernel =
      getOrDie(Device->createKernel<cg::SaxpyKernel>(cg::SaxpyLoaderSpec));

  // Allocate device memory for the two arrays.
  se::GlobalDeviceMemory<float> X =
      getOrDie(Device->allocateDeviceMemory<float>(ArraySize));
  se::GlobalDeviceMemory<float> Y =
      getOrDie(Device->allocateDeviceMemory<float>(ArraySize));

  // Enqueue the host-to-device copies, the kernel launch, and the
  // device-to-host copy of the result on a single stream.
  std::unique_ptr<se::Stream> Stream = getOrDie(Device->createStream());
  Stream->thenCopyH2D<float>(HostX, X)
      .thenCopyH2D<float>(HostY, Y)
      .thenLaunch(ArraySize, 1, *Kernel, A, X, Y)
      .thenCopyD2H<float>(X, HostX);
  // Wait for the stream to complete.
  check(Stream->blockHostUntilDone());

  // Verify the output data now in HostX: each element should be
  // A * X[i] + Y[i].
  std::vector<float> ExpectedX = {4, 47, 90, 133};
  assert(HostX == ExpectedX);

  // Free device memory.
  check(Device->freeDeviceMemory(X));
  check(Device->freeDeviceMemory(Y));
  /// [Example saxpy host main]
  return EXIT_SUCCESS;
}
Expand Up @@ -55,7 +55,7 @@
/// function as follows:
/// \code
/// namespace ccn = compiler_cuda_namespace;
/// using KernelPtr = std::unique_ptr<cnn::SaxpyKernel>;
/// using KernelPtr = std::unique_ptr<ccn::SaxpyKernel>;
/// // Assumes Device is a pointer to the Device on which to launch the
/// // kernel.
/// //
Expand Down
Expand Up @@ -115,8 +115,9 @@ class CUDAPTXInMemorySpec : public KernelLoaderSpec {
/// Adds each item in SpecList to this object.
///
/// Does not take ownership of the PTXCode pointers in the SpecList elements.
CUDAPTXInMemorySpec(llvm::StringRef KernelName,
const llvm::ArrayRef<PTXSpec> SpecList);
CUDAPTXInMemorySpec(
llvm::StringRef KernelName,
const llvm::ArrayRef<CUDAPTXInMemorySpec::PTXSpec> SpecList);

/// Returns a pointer to the PTX code for the requested compute capability.
///
Expand Down
@@ -0,0 +1,71 @@
//===-- StreamExecutor.h - Main include file for StreamExecutor -*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//

/// \mainpage Getting Started
///
/// \b StreamExecutor is a wrapper around CUDA and OpenCL host-side programming
/// models (runtimes). This abstraction cleanly permits host code to target
/// either CUDA or OpenCL devices with identically-functioning data parallel
/// kernels. It manages the execution of concurrent work targeting the
/// accelerator, similar to a host-side Executor.
///
/// This version of StreamExecutor can be built either as a sub-project of the
/// LLVM project or as a standalone project depending on LLVM as an external
/// package.
///
/// Below is an example of the use of the StreamExecutor API:
///
/// \snippet examples/Example.cpp Example saxpy host main
///
/// In the example, a couple of handler functions are used to handle error
/// return values in the StreamExecutor API:
///
/// \snippet examples/Example.cpp Example saxpy host helper functions
///
/// These are just example handler functions. A real application will likely
/// want to define similar handlers of its own that log errors in an
/// application-specific way, convert errors to the application's own
/// error-handling framework, or try to recover from errors as appropriate.
///
/// The example also references some symbols from a compiler-generated
/// namespace:
///
/// \snippet examples/Example.cpp Example saxpy compiler-generated
///
/// Instead of depending on the compiler to generate this code, you can
/// technically write the code yourself, but this is not recommended because the
/// code is very error-prone. For example, the template parameters for the
/// Kernel specialization have to match the parameter types for the device
/// kernel, and the MultiKernelLoaderSpec has to be initialized with valid
/// device code for the kernel. Errors in this code will not show up until
/// runtime, and may only show up as garbage output rather than an explicit
/// error, which can be very hard to debug, so again, it is strongly advised not
/// to write this code yourself.
///
/// The example compiler-generated code uses a PTX string in the source code to
/// store the device code, but the device code can also be stored in other
/// formats such as CUDA cubin and CUDA fatbin. Furthermore, device code can be
/// stored for other platforms such as OpenCL, and StreamExecutor will pick the
/// right device code at runtime based on the user's platform selection. See
/// streamexecutor::MultiKernelLoaderSpec for details of how device code can be
/// stored for different platforms, but again, the code to set up the
/// MultiKernelLoaderSpec instance should be generated by the compiler if
/// possible, not by the user.

#ifndef STREAMEXECUTOR_STREAMEXECUTOR_H
#define STREAMEXECUTOR_STREAMEXECUTOR_H

#include "Device.h"
#include "Kernel.h"
#include "KernelSpec.h"
#include "Platform.h"
#include "PlatformManager.h"
#include "Stream.h"

#endif // STREAMEXECUTOR_STREAMEXECUTOR_H

0 comments on commit 5b363dd

Please sign in to comment.