Skip to content

Commit

Permalink
GPUToCUDA: emit addressof directly instead of wrapping it into a gett…
Browse files Browse the repository at this point in the history
…er function

Originally, the CUBIN getter function was introduced as a mechanism to
circumvent the absence of globals in the LLVM dialect. It would allocate memory
and populate it with the CUBIN data. LLVM dialect now supports globals and they
are already used to store CUBIN data, making the getter function a trivial
address computation of a global. Emit the address computation directly at the
place of `gpu.launch_func` instead of putting it in a function and calling it.
This simplifies the conversion flow and prepares it for using the
DialectConversion infrastructure.

PiperOrigin-RevId: 273496221
  • Loading branch information
ftynse authored and tensorflower-gardener committed Oct 8, 2019
1 parent 16af592 commit 52e082b
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 77 deletions.
78 changes: 13 additions & 65 deletions mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp
Expand Up @@ -52,8 +52,6 @@ static constexpr const char *cuStreamSynchronizeName = "mcuStreamSynchronize";
static constexpr const char *kMcuMemHostRegisterPtr = "mcuMemHostRegisterPtr";

static constexpr const char *kCubinAnnotation = "nvvm.cubin";
static constexpr const char *kCubinGetterAnnotation = "nvvm.cubingetter";
static constexpr const char *kCubinGetterSuffix = "_cubin";
static constexpr const char *kCubinStorageSuffix = "_cubin_cst";

namespace {
Expand Down Expand Up @@ -124,7 +122,6 @@ class GpuLaunchFuncToCudaCallsPass
Value *setupParamsArray(gpu::LaunchFuncOp launchOp, OpBuilder &builder);
Value *generateKernelNameConstant(FuncOp kernelFunction, Location &loc,
OpBuilder &builder);
FuncOp generateCubinAccessor(FuncOp kernelFunc, StringAttr blob);
void translateGpuLaunchCalls(mlir::gpu::LaunchFuncOp launchOp);

public:
Expand All @@ -136,21 +133,6 @@ class GpuLaunchFuncToCudaCallsPass
initializeCachedTypes();

getModule().walk([this](mlir::gpu::LaunchFuncOp op) {
auto gpuModule =
getModule().lookupSymbol<ModuleOp>(op.getKernelModuleName());
auto kernelFunc = gpuModule.lookupSymbol<FuncOp>(op.kernel());
auto cubinAttr = kernelFunc.getAttrOfType<StringAttr>(kCubinAnnotation);
if (!cubinAttr)
return signalPassFailure();
FuncOp getter = generateCubinAccessor(kernelFunc, cubinAttr);

// Store the name of the getter on the function for easier lookup and
// remove the original CUBIN annotation.
kernelFunc.setAttr(
kCubinGetterAnnotation,
SymbolRefAttr::get(getter.getName(), getter.getContext()));
kernelFunc.removeAttr(kCubinAnnotation);

translateGpuLaunchCalls(op);
});

Expand Down Expand Up @@ -335,42 +317,6 @@ Value *GpuLaunchFuncToCudaCallsPass::generateKernelNameConstant(
llvmDialect);
}

// Inserts a global constant string containing `blob` into the grand-parent
// module of `kernelFunc` and generates the function that returns the address of
// the first character of this string.
//
// `kernelFunc` is the kernel function whose name is used as the prefix for both
// generated symbols: the getter is named `<kernel name> + kCubinGetterSuffix`
// and the string storage is named `<kernel name> + kCubinStorageSuffix`.
// `blob` holds the data that is stored in the global string.
//
// Returns the newly created getter function, which takes no arguments and
// returns an LLVM dialect `i8*`.
FuncOp GpuLaunchFuncToCudaCallsPass::generateCubinAccessor(FuncOp kernelFunc,
StringAttr blob) {
Location loc = kernelFunc.getLoc();
// Start the name buffer with the kernel name; suffixes are appended below.
SmallString<128> nameBuffer(kernelFunc.getName());
ModuleOp module = getModule();
// The kernel is expected to live in an op that is itself a direct child of the
// module this pass runs on.
assert(kernelFunc.getParentOp() &&
kernelFunc.getParentOp()->getParentOp() == module &&
"expected one level of module nesting");

// Insert the getter function just after the GPU kernel module containing
// `kernelFunc`.
OpBuilder moduleBuilder(module.getBody());
moduleBuilder.setInsertionPointAfter(kernelFunc.getParentOp());
// Getter signature: no inputs, a single `i8*` result.
auto getterType = moduleBuilder.getFunctionType(
llvm::None, LLVM::LLVMType::getInt8PtrTy(llvmDialect));
nameBuffer.append(kCubinGetterSuffix);
auto result = moduleBuilder.create<FuncOp>(
loc, StringRef(nameBuffer), getterType, ArrayRef<NamedAttribute>());
Block *entryBlock = result.addEntryBlock();

// Drop the getter suffix before appending the storage suffix.
// Resizing to the kernel name length truncates the buffer back to the bare
// kernel name, so the same buffer can be reused for the storage symbol.
nameBuffer.resize(kernelFunc.getName().size());
nameBuffer.append(kCubinStorageSuffix);

// Obtain the address of the first character of the global string containing
// the cubin and return from the getter.
// NOTE(review): presumably `LLVM::createGlobalString` also creates the global
// itself in the enclosing module -- confirm against its definition.
OpBuilder builder(entryBlock);
Value *startPtr = LLVM::createGlobalString(
loc, builder, StringRef(nameBuffer), blob.getValue(), llvmDialect);
builder.create<LLVM::ReturnOp>(loc, startPtr);
return result;
}

// Emits LLVM IR to launch a kernel function. Expects the module that contains
// the compiled kernel function as a cubin in the 'nvvm.cubin' attribute of the
// kernel function in the IR.
Expand Down Expand Up @@ -399,31 +345,33 @@ void GpuLaunchFuncToCudaCallsPass::translateGpuLaunchCalls(

auto zero = builder.create<LLVM::ConstantOp>(loc, getInt32Type(),
builder.getI32IntegerAttr(0));
// Emit a call to the cubin getter to retrieve a pointer to the data that
// represents the cubin at runtime.
// TODO(herhut): This should rather be a static global once supported.
// Create an LLVM global with CUBIN extracted from the kernel annotation and
// obtain a pointer to the first byte in it.
auto kernelModule =
getModule().lookupSymbol<ModuleOp>(launchOp.getKernelModuleName());
assert(kernelModule && "expected a kernel module");
auto kernelFunction = kernelModule.lookupSymbol<FuncOp>(launchOp.kernel());
assert(kernelFunction && "expected a kernel function");

auto cubinGetter =
kernelFunction.getAttrOfType<SymbolRefAttr>(kCubinGetterAnnotation);
if (!cubinGetter) {
kernelFunction.emitError("missing ")
<< kCubinGetterAnnotation << " attribute.";
auto cubinAttr = kernelFunction.getAttrOfType<StringAttr>(kCubinAnnotation);
if (!cubinAttr) {
kernelFunction.emitOpError()
<< "missing " << kCubinAnnotation << " attribute";
return signalPassFailure();
}
auto data = builder.create<LLVM::CallOp>(
loc, ArrayRef<Type>{getPointerType()}, cubinGetter, ArrayRef<Value *>{});
assert(kernelModule.getName() && "expected a named module");
SmallString<128> nameBuffer(*kernelModule.getName());
nameBuffer.append(kCubinStorageSuffix);
Value *data = LLVM::createGlobalString(
loc, builder, nameBuffer.str(), cubinAttr.getValue(), getLLVMDialect());

// Emit the load module call to load the module data. Error checking is done
// in the called helper function.
auto cuModule = allocatePointer(builder, loc);
FuncOp cuModuleLoad = getModule().lookupSymbol<FuncOp>(cuModuleLoadName);
builder.create<LLVM::CallOp>(loc, ArrayRef<Type>{getCUResultType()},
builder.getSymbolRefAttr(cuModuleLoad),
ArrayRef<Value *>{cuModule, data.getResult(0)});
ArrayRef<Value *>{cuModule, data});
// Get the function from the module. The name corresponds to the name of
// the kernel function.
auto cuOwningModuleRef =
Expand Down
20 changes: 8 additions & 12 deletions mlir/test/Conversion/GPUToCUDA/lower-launch-func-to-cuda.mlir
Expand Up @@ -10,23 +10,19 @@ module attributes {gpu.container_module} {
attributes { gpu.kernel, nvvm.cubin = "CUBIN" }
}

// CHECK: func @[[getter:.*]]() -> !llvm<"i8*">
// CHECK: %[[addressof:.*]] = llvm.mlir.addressof @[[global]]
// CHECK: %[[c0:.*]] = llvm.mlir.constant(0 : index)
// CHECK: %[[gep:.*]] = llvm.getelementptr %[[addressof]][%[[c0]], %[[c0]]]
// CHECK-SAME: -> !llvm<"i8*">
// CHECK: llvm.return %[[gep]] : !llvm<"i8*">

func @foo() {
%0 = "op"() : () -> (!llvm.float)
%1 = "op"() : () -> (!llvm<"float*">)
%cst = constant 8 : index

// CHECK: [[cubin_ptr:%.*]] = llvm.call @[[getter]]
// CHECK: [[module_ptr:%.*]] = llvm.alloca {{.*}} x !llvm<"i8*"> : (!llvm.i32) -> !llvm<"i8**">
// CHECK: llvm.call @mcuModuleLoad([[module_ptr]], [[cubin_ptr]]) : (!llvm<"i8**">, !llvm<"i8*">) -> !llvm.i32
// CHECK: [[func_ptr:%.*]] = llvm.alloca {{.*}} x !llvm<"i8*"> : (!llvm.i32) -> !llvm<"i8**">
// CHECK: llvm.call @mcuModuleGetFunction([[func_ptr]], {{.*}}, {{.*}}) : (!llvm<"i8**">, !llvm<"i8*">, !llvm<"i8*">) -> !llvm.i32
// CHECK: %[[addressof:.*]] = llvm.mlir.addressof @[[global]]
// CHECK: %[[c0:.*]] = llvm.mlir.constant(0 : index)
// CHECK: %[[cubin_ptr:.*]] = llvm.getelementptr %[[addressof]][%[[c0]], %[[c0]]]
// CHECK-SAME: -> !llvm<"i8*">
// CHECK: %[[module_ptr:.*]] = llvm.alloca {{.*}} x !llvm<"i8*"> : (!llvm.i32) -> !llvm<"i8**">
// CHECK: llvm.call @mcuModuleLoad(%[[module_ptr]], %[[cubin_ptr]]) : (!llvm<"i8**">, !llvm<"i8*">) -> !llvm.i32
// CHECK: %[[func_ptr:.*]] = llvm.alloca {{.*}} x !llvm<"i8*"> : (!llvm.i32) -> !llvm<"i8**">
// CHECK: llvm.call @mcuModuleGetFunction(%[[func_ptr]], {{.*}}, {{.*}}) : (!llvm<"i8**">, !llvm<"i8*">, !llvm<"i8*">) -> !llvm.i32
// CHECK: llvm.call @mcuGetStreamHelper
// CHECK: llvm.call @mcuLaunchKernel
// CHECK: llvm.call @mcuStreamSynchronize
Expand Down

0 comments on commit 52e082b

Please sign in to comment.