GPUToCUDA: attach CUBIN to the nested module rather than to the function
Originally, we attached the attributes containing CUBIN blobs to the kernel
function called by `gpu.launch_func`. This kernel is now contained in a nested
module that serves as the compilation unit. Attach the compiled CUBIN blob to
that module rather than to the function, since the module is what actually gets
compiled. This also avoids duplicating the attribute across multiple kernels
within the same module.
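
Concretely (mirroring the updated tests below), the nvvm.cubin attribute moves
from the kernel function to its enclosing nested module:

    // Before: every kernel function carries its own copy of the blob.
    module @kernel_module attributes {gpu.kernel_module} {
      func @kernel(!llvm.float, !llvm<"float*">)
          attributes { gpu.kernel, nvvm.cubin = "CUBIN" }
    }

    // After: the blob is attached once, to the nested module itself.
    module @kernel_module attributes {gpu.kernel_module, nvvm.cubin = "CUBIN"} {
      func @kernel(!llvm.float, !llvm<"float*">)
          attributes { gpu.kernel }
    }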

PiperOrigin-RevId: 273497303
ftynse authored and tensorflower-gardener committed Oct 8, 2019
1 parent 52e082b commit 11d1267
Showing 6 changed files with 57 additions and 64 deletions.
3 changes: 2 additions & 1 deletion mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h
@@ -38,7 +38,8 @@ class LLVMDialect;
 template <typename T> class OpPassBase;
 
 using OwnedCubin = std::unique_ptr<std::vector<char>>;
-using CubinGenerator = std::function<OwnedCubin(const std::string &, FuncOp &)>;
+using CubinGenerator =
+    std::function<OwnedCubin(const std::string &, Location, StringRef)>;
 
 /// Creates a pass to convert kernel functions into CUBIN blobs.
 ///
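
For readers providing their own generator: below is a minimal sketch of a
callback matching the new CubinGenerator signature. The function name is
hypothetical; the in-tree generators (compilePtxToCubinForTesting and
compilePtxToCubin) appear in the diffs that follow.

    #include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h"
    #include "mlir/IR/Diagnostics.h"
    #include "llvm/ADT/StringRef.h"

    using namespace mlir;

    // Hypothetical generator: performs no real compilation, just validates
    // the input and copies the PTX bytes into the "CUBIN" buffer. Errors are
    // reported against `loc`; `name` identifies the kernel module in logs.
    OwnedCubin echoPtxAsCubin(const std::string &ptx, Location loc,
                              StringRef name) {
      if (ptx.empty()) {
        emitError(loc, "no PTX was produced for ") << name;
        return {};
      }
      return std::make_unique<std::vector<char>>(ptx.begin(), ptx.end());
    }

Such a callback is handed to the pass created via this header; the pass invokes
it once per kernel module with the PTX it produced.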
66 changes: 31 additions & 35 deletions mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp
@@ -62,8 +62,10 @@ class GpuKernelToCubinPass : public ModulePass<GpuKernelToCubinPass> {
       : cubinGenerator(cubinGenerator) {}
 
   void runOnModule() override {
-    if (!getModule().getAttrOfType<UnitAttr>(
-            gpu::GPUDialect::getKernelModuleAttrName()))
+    ModuleOp module = getModule();
+    if (!module.getAttrOfType<UnitAttr>(
+            gpu::GPUDialect::getKernelModuleAttrName()) ||
+        !module.getName())
       return;
 
     // Make sure the NVPTX target is initialized.
@@ -72,31 +74,35 @@ class GpuKernelToCubinPass : public ModulePass<GpuKernelToCubinPass> {
     LLVMInitializeNVPTXTargetMC();
     LLVMInitializeNVPTXAsmPrinter();
 
-    auto llvmModule = translateModuleToNVVMIR(getModule());
+    auto llvmModule = translateModuleToNVVMIR(module);
     if (!llvmModule)
       return signalPassFailure();
 
-    for (auto function : getModule().getOps<FuncOp>()) {
-      if (!gpu::GPUDialect::isKernel(function))
-        continue;
-      if (failed(translateGpuKernelToCubinAnnotation(*llvmModule, function)))
-        signalPassFailure();
-    }
+    // Translate the module to CUBIN and attach the result as an attribute to
+    // the module.
+    if (auto cubinAttr = translateGpuModuleToCubinAnnotation(
+            *llvmModule, module.getLoc(), *module.getName()))
+      module.setAttr(kCubinAnnotation, cubinAttr);
+    else
+      signalPassFailure();
   }
 
 private:
   static OwnedCubin compilePtxToCubinForTesting(const std::string &ptx,
-                                                FuncOp &function);
+                                                Location, StringRef);
 
   std::string translateModuleToPtx(llvm::Module &module,
                                    llvm::TargetMachine &target_machine);
 
-  /// Converts llvmModule to cubin using the user-provded generator.
-  OwnedCubin convertModuleToCubin(llvm::Module &llvmModule, FuncOp &function);
+  /// Converts llvmModule to cubin using the user-provided generator. Location
+  /// is used for error reporting and name is forwarded to the CUBIN generator
+  /// to use in its logging mechanisms.
+  OwnedCubin convertModuleToCubin(llvm::Module &llvmModule, Location loc,
+                                  StringRef name);
 
-  /// Translates llvmModule to cubin and assigns it to attribute of function.
-  LogicalResult translateGpuKernelToCubinAnnotation(llvm::Module &llvmModule,
-                                                    FuncOp &function);
+  /// Translates llvmModule to cubin and returns the result as an attribute.
+  StringAttr translateGpuModuleToCubinAnnotation(llvm::Module &llvmModule,
+                                                 Location loc, StringRef name);
 
   CubinGenerator cubinGenerator;
 };
@@ -120,13 +126,14 @@ std::string GpuKernelToCubinPass::translateModuleToPtx(
 
 OwnedCubin
 GpuKernelToCubinPass::compilePtxToCubinForTesting(const std::string &ptx,
-                                                  FuncOp &function) {
+                                                  Location, StringRef) {
   const char data[] = "CUBIN";
   return std::make_unique<std::vector<char>>(data, data + sizeof(data) - 1);
 }
 
 OwnedCubin GpuKernelToCubinPass::convertModuleToCubin(llvm::Module &llvmModule,
-                                                      FuncOp &function) {
+                                                      Location loc,
+                                                      StringRef name) {
   std::unique_ptr<llvm::TargetMachine> targetMachine;
   {
     std::string error;
@@ -136,7 +143,7 @@ OwnedCubin GpuKernelToCubinPass::convertModuleToCubin(llvm::Module &llvmModule,
     const llvm::Target *target =
         llvm::TargetRegistry::lookupTarget("", triple, error);
     if (target == nullptr) {
-      function.emitError("cannot initialize target triple");
+      emitError(loc, "cannot initialize target triple");
       return {};
     }
     targetMachine.reset(
@@ -148,26 +155,15 @@ OwnedCubin GpuKernelToCubinPass::convertModuleToCubin(llvm::Module &llvmModule,
 
   auto ptx = translateModuleToPtx(llvmModule, *targetMachine);
 
-  return cubinGenerator(ptx, function);
+  return cubinGenerator(ptx, loc, name);
 }
 
-LogicalResult GpuKernelToCubinPass::translateGpuKernelToCubinAnnotation(
-    llvm::Module &llvmModule, FuncOp &function) {
-  auto cubin = convertModuleToCubin(llvmModule, function);
+StringAttr GpuKernelToCubinPass::translateGpuModuleToCubinAnnotation(
+    llvm::Module &llvmModule, Location loc, StringRef name) {
+  auto cubin = convertModuleToCubin(llvmModule, loc, name);
   if (!cubin)
-    return function.emitError("translation to CUDA binary failed.");
-
-  Builder builder(function.getContext());
-  function.setAttr(kCubinAnnotation,
-                   builder.getStringAttr({cubin->data(), cubin->size()}));
-
-  // Remove the body of the kernel function now that it has been translated.
-  // The main reason to do this is so that the resulting module no longer
-  // contains the NVVM instructions (typically contained in the kernel bodies)
-  // and hence can be compiled into host code by a separate pass.
-  function.eraseBody();
-
-  return success();
+    return {};
+  return StringAttr::get({cubin->data(), cubin->size()}, loc->getContext());
 }
 
 std::unique_ptr<OpPassBase<ModuleOp>>
18 changes: 7 additions & 11 deletions mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp
@@ -120,7 +120,7 @@ class GpuLaunchFuncToCudaCallsPass
 
   void declareCudaFunctions(Location loc);
   Value *setupParamsArray(gpu::LaunchFuncOp launchOp, OpBuilder &builder);
-  Value *generateKernelNameConstant(FuncOp kernelFunction, Location &loc,
+  Value *generateKernelNameConstant(StringRef name, Location &loc,
                                     OpBuilder &builder);
   void translateGpuLaunchCalls(mlir::gpu::LaunchFuncOp launchOp);
 
@@ -304,14 +304,12 @@ GpuLaunchFuncToCudaCallsPass::setupParamsArray(gpu::LaunchFuncOp launchOp,
 //   %2 = llvm.getelementptr %0[%1, %1] : !llvm<"i8*">
 // }
 Value *GpuLaunchFuncToCudaCallsPass::generateKernelNameConstant(
-    FuncOp kernelFunction, Location &loc, OpBuilder &builder) {
+    StringRef name, Location &loc, OpBuilder &builder) {
   // Make sure the trailing zero is included in the constant.
-  std::vector<char> kernelName(kernelFunction.getName().begin(),
-                               kernelFunction.getName().end());
+  std::vector<char> kernelName(name.begin(), name.end());
   kernelName.push_back('\0');
 
-  std::string globalName =
-      llvm::formatv("{0}_kernel_name", kernelFunction.getName());
+  std::string globalName = llvm::formatv("{0}_kernel_name", name);
   return LLVM::createGlobalString(
       loc, builder, globalName, StringRef(kernelName.data(), kernelName.size()),
       llvmDialect);
@@ -350,12 +348,10 @@ void GpuLaunchFuncToCudaCallsPass::translateGpuLaunchCalls(
   auto kernelModule =
       getModule().lookupSymbol<ModuleOp>(launchOp.getKernelModuleName());
   assert(kernelModule && "expected a kernel module");
-  auto kernelFunction = kernelModule.lookupSymbol<FuncOp>(launchOp.kernel());
-  assert(kernelFunction && "expected a kernel function");
 
-  auto cubinAttr = kernelFunction.getAttrOfType<StringAttr>(kCubinAnnotation);
+  auto cubinAttr = kernelModule.getAttrOfType<StringAttr>(kCubinAnnotation);
   if (!cubinAttr) {
-    kernelFunction.emitOpError()
+    kernelModule.emitOpError()
         << "missing " << kCubinAnnotation << " attribute";
     return signalPassFailure();
   }
@@ -376,7 +372,7 @@ void GpuLaunchFuncToCudaCallsPass::translateGpuLaunchCalls(
   // the kernel function.
   auto cuOwningModuleRef =
       builder.create<LLVM::LoadOp>(loc, getPointerType(), cuModule);
-  auto kernelName = generateKernelNameConstant(kernelFunction, loc, builder);
+  auto kernelName = generateKernelNameConstant(launchOp.kernel(), loc, builder);
   auto cuFunction = allocatePointer(builder, loc);
   FuncOp cuModuleGetFunction =
       getModule().lookupSymbol<FuncOp>(cuModuleGetFunctionName);
4 changes: 2 additions & 2 deletions mlir/test/Conversion/GPUToCUDA/lower-launch-func-to-cuda.mlir
@@ -5,9 +5,9 @@ module attributes {gpu.container_module} {
   // CHECK: llvm.mlir.global constant @[[kernel_name:.*]]("kernel\00")
   // CHECK: llvm.mlir.global constant @[[global:.*]]("CUBIN")
 
-  module @kernel_module attributes {gpu.kernel_module} {
+  module @kernel_module attributes {gpu.kernel_module, nvvm.cubin = "CUBIN"} {
     func @kernel(!llvm.float, !llvm<"float*">)
-        attributes { gpu.kernel, nvvm.cubin = "CUBIN" }
+        attributes { gpu.kernel }
   }
 
   func @foo() {
@@ -1,10 +1,9 @@
 // RUN: mlir-opt %s --test-kernel-to-cubin -split-input-file | FileCheck %s
 
-module attributes {gpu.kernel_module} {
+// CHECK: attributes {gpu.kernel_module, nvvm.cubin = "CUBIN"}
+module @kernels attributes {gpu.kernel_module} {
   func @kernel(%arg0 : !llvm.float, %arg1 : !llvm<"float*">)
-    // CHECK: attributes {gpu.kernel, nvvm.cubin = "CUBIN"}
     attributes { gpu.kernel } {
     // CHECK-NOT: llvm.return
     llvm.return
   }
 }
25 changes: 13 additions & 12 deletions mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp
@@ -43,24 +43,25 @@
 using namespace mlir;
 
 inline void emit_cuda_error(const llvm::Twine &message, const char *buffer,
-                            CUresult error, FuncOp &function) {
-  function.emitError(message.concat(" failed with error code ")
-                         .concat(llvm::Twine{error})
-                         .concat("[")
-                         .concat(buffer)
-                         .concat("]"));
+                            CUresult error, Location loc) {
+  emitError(loc, message.concat(" failed with error code ")
+                     .concat(llvm::Twine{error})
+                     .concat("[")
+                     .concat(buffer)
+                     .concat("]"));
 }
 
 #define RETURN_ON_CUDA_ERROR(expr, msg)                                       \
   {                                                                           \
     auto _cuda_error = (expr);                                                \
     if (_cuda_error != CUDA_SUCCESS) {                                        \
-      emit_cuda_error(msg, jitErrorBuffer, _cuda_error, function);            \
+      emit_cuda_error(msg, jitErrorBuffer, _cuda_error, loc);                 \
       return {};                                                              \
     }                                                                         \
   }
 
-OwnedCubin compilePtxToCubin(const std::string ptx, FuncOp &function) {
+OwnedCubin compilePtxToCubin(const std::string ptx, Location loc,
+                             StringRef name) {
   char jitErrorBuffer[4096] = {0};
 
   RETURN_ON_CUDA_ERROR(cuInit(0), "cuInit");
@@ -86,10 +87,10 @@ OwnedCubin compilePtxToCubin(const std::string ptx, FuncOp &function) {
   RETURN_ON_CUDA_ERROR(
       cuLinkAddData(linkState, CUjitInputType::CU_JIT_INPUT_PTX,
                     const_cast<void *>(static_cast<const void *>(ptx.c_str())),
-                    ptx.length(), function.getName().data(), /* kernel name */
-                    0,                                       /* number of jit options */
-                    nullptr,                                 /* jit options */
-                    nullptr                                  /* jit option values */
+                    ptx.length(), name.data(), /* kernel name */
+                    0,                         /* number of jit options */
+                    nullptr,                   /* jit options */
+                    nullptr                    /* jit option values */
       ),
       "cuLinkAddData");
 
