Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[StreamExecutor] Add Doxygen main page
Reviewers: jlebar Subscribers: jprice, parallel_libs-commits Differential Revision: https://reviews.llvm.org/D24066 llvm-svn: 280277
- Loading branch information
Showing
7 changed files
with
243 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
# Build the example program from Example.cpp and link it against streamexecutor. | ||
add_executable(example Example.cpp) | ||
target_link_libraries(example streamexecutor) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,163 @@ | ||
//===-- Example.cpp - Example code for documentation ----------------------===// | ||
// | ||
// The LLVM Compiler Infrastructure | ||
// | ||
// This file is distributed under the University of Illinois Open Source | ||
// License. See LICENSE.TXT for details. | ||
// | ||
//===----------------------------------------------------------------------===// | ||
/// | ||
/// \file | ||
/// This file contains example code demonstrating the usage of the | ||
/// StreamExecutor API. Snippets of this file will be included as code examples | ||
/// in documentation. Taking these examples from a real source file guarantees | ||
/// that the examples will always compile. | ||
/// | ||
//===----------------------------------------------------------------------===// | ||
|
||
#include <cassert> | ||
#include <cstdio> | ||
#include <cstdlib> | ||
#include <memory> | ||
#include <vector> | ||
|
||
#include "streamexecutor/StreamExecutor.h" | ||
|
||
/// [Example saxpy host helper functions] | ||
// Example unwrapper for streamexecutor::Expected return values.
//
// Hands back the contained value (moved out of E) when E holds one. Otherwise
// prints the error's message to stderr and exits with EXIT_FAILURE.
template <typename T> T getOrDie(streamexecutor::Expected<T> &&E) {
  if (E)
    return std::move(*E);

  const auto Message = streamexecutor::consumeAndGetMessage(E.takeError());
  std::fprintf(stderr, "Error extracting an expected value: %s.\n",
               Message.c_str());
  std::exit(EXIT_FAILURE);
}
|
||
// Example handler for streamexecutor::Error return values. | ||
void check(streamexecutor::Error &&E) { | ||
if (E) { | ||
std::fprintf(stderr, "Error encountered: %s.\n", | ||
streamexecutor::consumeAndGetMessage(std::move(E)).c_str()); | ||
std::exit(EXIT_FAILURE); | ||
} | ||
} | ||
/// [Example saxpy host helper functions] | ||
|
||
/// [Example saxpy compiler-generated] | ||
// Code in this namespace is generated by the compiler (e.g. clang). | ||
// | ||
// The name of this namespace may depend on the compiler that generated it, so | ||
// this is just an example name. | ||
namespace __compilergen { | ||
|
||
// Specialization of the streamexecutor::Kernel template class for the parameter | ||
// types of the saxpy(float A, float *X, float *Y) kernel. | ||
using SaxpyKernel = | ||
streamexecutor::Kernel<float, streamexecutor::GlobalDeviceMemory<float>, | ||
streamexecutor::GlobalDeviceMemory<float>>; | ||
|
||
// A string containing the PTX code generated by the device compiler for the | ||
// saxpy kernel. String contents not shown here. | ||
extern const char *SaxpyPTX; | ||
|
||
// A global instance of a loader spec that knows how to load the code in the | ||
// SaxpyPTX string. | ||
static streamexecutor::MultiKernelLoaderSpec SaxpyLoaderSpec = []() { | ||
streamexecutor::MultiKernelLoaderSpec Spec; | ||
Spec.addCUDAPTXInMemory("saxpy", {{{2, 0}, SaxpyPTX}}); | ||
return Spec; | ||
}(); | ||
|
||
} // namespace __compilergen | ||
/// [Example saxpy compiler-generated] | ||
|
||
/// [Example saxpy host PTX] | ||
// Definition of the PTX string for the saxpy entry point. | ||
// Each thread takes its element offset from %tid.x (times 4 bytes), loads one | ||
// float from X and one from Y at that offset, computes A * X + Y with a single | ||
// fma, and stores the result back into X at the same offset. | ||
// Only %tid.x is read for indexing; no block index is used. | ||
const char *__compilergen::SaxpyPTX = R"( | ||
.version 4.3 | ||
.target sm_20 | ||
.address_size 64 | ||
.visible .entry saxpy(.param .f32 A, .param .u64 X, .param .u64 Y) { | ||
.reg .f32 %AValue; | ||
.reg .f32 %XValue; | ||
.reg .f32 %YValue; | ||
.reg .f32 %Result; | ||
.reg .b64 %XBaseAddrGeneric; | ||
.reg .b64 %YBaseAddrGeneric; | ||
.reg .b64 %XBaseAddrGlobal; | ||
.reg .b64 %YBaseAddrGlobal; | ||
.reg .b64 %XAddr; | ||
.reg .b64 %YAddr; | ||
.reg .b64 %ThreadByteOffset; | ||
.reg .b32 %TID; | ||
ld.param.f32 %AValue, [A]; | ||
ld.param.u64 %XBaseAddrGeneric, [X]; | ||
ld.param.u64 %YBaseAddrGeneric, [Y]; | ||
cvta.to.global.u64 %XBaseAddrGlobal, %XBaseAddrGeneric; | ||
cvta.to.global.u64 %YBaseAddrGlobal, %YBaseAddrGeneric; | ||
mov.u32 %TID, %tid.x; | ||
mul.wide.u32 %ThreadByteOffset, %TID, 4; | ||
add.s64 %XAddr, %ThreadByteOffset, %XBaseAddrGlobal; | ||
add.s64 %YAddr, %ThreadByteOffset, %YBaseAddrGlobal; | ||
ld.global.f32 %XValue, [%XAddr]; | ||
ld.global.f32 %YValue, [%YAddr]; | ||
fma.rn.f32 %Result, %AValue, %XValue, %YValue; | ||
st.global.f32 [%XAddr], %Result; | ||
ret; | ||
} | ||
)"; | ||
/// [Example saxpy host PTX] | ||
|
||
int main() { | ||
/// [Example saxpy host main] | ||
namespace se = ::streamexecutor; | ||
namespace cg = ::__compilergen; | ||
|
||
// Create some host data. | ||
float A = 42.0f; | ||
std::vector<float> HostX = {0, 1, 2, 3}; | ||
std::vector<float> HostY = {4, 5, 6, 7}; | ||
size_t ArraySize = HostX.size(); | ||
|
||
// Get a device object. | ||
se::Platform *Platform = | ||
getOrDie(se::PlatformManager::getPlatformByName("CUDA")); | ||
if (Platform->getDeviceCount() == 0) { | ||
return EXIT_FAILURE; | ||
} | ||
se::Device *Device = getOrDie(Platform->getDevice(0)); | ||
|
||
// Load the kernel onto the device. | ||
std::unique_ptr<cg::SaxpyKernel> Kernel = | ||
getOrDie(Device->createKernel<cg::SaxpyKernel>(cg::SaxpyLoaderSpec)); | ||
|
||
// Allocate memory on the device. | ||
se::GlobalDeviceMemory<float> X = | ||
getOrDie(Device->allocateDeviceMemory<float>(ArraySize)); | ||
se::GlobalDeviceMemory<float> Y = | ||
getOrDie(Device->allocateDeviceMemory<float>(ArraySize)); | ||
|
||
// Run operations on a stream. | ||
std::unique_ptr<se::Stream> Stream = getOrDie(Device->createStream()); | ||
Stream->thenCopyH2D<float>(HostX, X) | ||
.thenCopyH2D<float>(HostY, Y) | ||
.thenLaunch(ArraySize, 1, *Kernel, A, X, Y) | ||
.thenCopyD2H<float>(X, HostX); | ||
// Wait for the stream to complete. | ||
check(Stream->blockHostUntilDone()); | ||
|
||
// Process output data in HostX. | ||
std::vector<float> ExpectedX = {4, 47, 90, 133}; | ||
for (size_t I = 0; I < ArraySize; ++I) { | ||
assert(HostX[I] == ExpectedX[I]); | ||
} | ||
|
||
// Free device memory. | ||
check(Device->freeDeviceMemory(X)); | ||
check(Device->freeDeviceMemory(Y)); | ||
/// [Example saxpy host main] | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
71 changes: 71 additions & 0 deletions
71
parallel-libs/streamexecutor/include/streamexecutor/StreamExecutor.h
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
//===-- StreamExecutor.h - Main include file for StreamExecutor -*- C++ -*-===// | ||
// | ||
// The LLVM Compiler Infrastructure | ||
// | ||
// This file is distributed under the University of Illinois Open Source | ||
// License. See LICENSE.TXT for details. | ||
// | ||
//===----------------------------------------------------------------------===// | ||
|
||
/// \mainpage Getting Started | ||
/// | ||
/// \b StreamExecutor is a wrapper around CUDA and OpenCL host-side programming | ||
/// models (runtimes). This abstraction cleanly permits host code to target | ||
/// either CUDA or OpenCL devices with identically-functioning data parallel | ||
/// kernels. It manages the execution of concurrent work targeting the | ||
/// accelerator, similar to a host-side Executor. | ||
/// | ||
/// This version of StreamExecutor can be built either as a sub-project of the | ||
/// LLVM project or as a standalone project depending on LLVM as an external | ||
/// package. | ||
/// | ||
/// Below is an example of the use of the StreamExecutor API: | ||
/// | ||
/// \snippet examples/Example.cpp Example saxpy host main | ||
/// | ||
/// In the example, a couple of handler functions are used to handle error | ||
/// return values in the StreamExecutor API: | ||
/// | ||
/// \snippet examples/Example.cpp Example saxpy host helper functions | ||
/// | ||
/// These are just example handler functions. A real application will likely | ||
/// want to define similar handlers of its own that log errors in an | ||
/// application-specific way, convert errors to the application's own | ||
/// error-handling framework, or try to recover from errors as appropriate. | ||
/// | ||
/// The example also references some symbols from a compiler-generated | ||
/// namespace: | ||
/// | ||
/// \snippet examples/Example.cpp Example saxpy compiler-generated | ||
/// | ||
/// Instead of depending on the compiler to generate this code, you can | ||
/// technically write the code yourself, but this is not recommended because the | ||
/// code is very error-prone. For example, the template parameters for the | ||
/// Kernel specialization have to match the parameter types for the device | ||
/// kernel, and the MultiKernelLoaderSpec has to be initialized with valid | ||
/// device code for the kernel. Errors in this code will not show up until | ||
/// runtime, and may only show up as garbage output rather than an explicit | ||
/// error, which can be very hard to debug, so again, it is strongly advised not | ||
/// to write this code yourself. | ||
/// | ||
/// The example compiler-generated code uses a PTX string in the source code to | ||
/// store the device code, but the device code can also be stored in other | ||
/// formats such as CUDA cubin and CUDA fatbin. Furthermore, device code can be | ||
/// stored for other platforms such as OpenCL, and StreamExecutor will pick the | ||
/// right device code at runtime based on the user's platform selection. See | ||
/// streamexecutor::MultiKernelLoaderSpec for details of how device code can be | ||
/// stored for different platforms, but again, the code to set up the | ||
/// MultiKernelLoaderSpec instance should be generated by the compiler if | ||
/// possible, not by the user. | ||
|
||
#ifndef STREAMEXECUTOR_STREAMEXECUTOR_H | ||
#define STREAMEXECUTOR_STREAMEXECUTOR_H | ||
|
||
#include "Device.h" | ||
#include "Kernel.h" | ||
#include "KernelSpec.h" | ||
#include "Platform.h" | ||
#include "PlatformManager.h" | ||
#include "Stream.h" | ||
|
||
#endif // STREAMEXECUTOR_STREAMEXECUTOR_H |