GPUToCUDA: attach CUBIN to the nested module rather than to the function
Originally, we attached the attributes containing CUBIN blobs to the kernel
function called by `gpu.launch_func`. This kernel is now contained in a nested
module that serves as the compilation unit. Attach the compiled CUBIN blob to
that module rather than to the function, since the module is what actually gets
compiled. This also avoids duplicating the attribute across multiple kernels
within the same module.
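
Concretely (mirroring the updated tests below), the nvvm.cubin attribute moves
from the kernel function to its enclosing nested module:

    // Before: every kernel function carries its own copy of the blob.
    module @kernel_module attributes {gpu.kernel_module} {
      func @kernel(!llvm.float, !llvm<"float*">)
          attributes { gpu.kernel, nvvm.cubin = "CUBIN" }
    }

    // After: the blob is attached once, to the nested module itself.
    module @kernel_module attributes {gpu.kernel_module, nvvm.cubin = "CUBIN"} {
      func @kernel(!llvm.float, !llvm<"float*">)
          attributes { gpu.kernel }
    }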

PiperOrigin-RevId: 273497303
ftynse authored and tensorflower-gardener committed Oct 8, 2019
1 parent 52e082b commit 11d1267
Showing 6 changed files with 57 additions and 64 deletions.
3 changes: 2 additions & 1 deletion mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h
@@ -38,7 +38,8 @@ class LLVMDialect;
 template <typename T> class OpPassBase;
 
 using OwnedCubin = std::unique_ptr<std::vector<char>>;
-using CubinGenerator = std::function<OwnedCubin(const std::string &, FuncOp &)>;
+using CubinGenerator =
+    std::function<OwnedCubin(const std::string &, Location, StringRef)>;
 
 /// Creates a pass to convert kernel functions into CUBIN blobs.
 ///
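
For readers providing their own generator: below is a minimal sketch of a
callback matching the new CubinGenerator signature. The function name is
hypothetical; the in-tree generators (compilePtxToCubinForTesting and
compilePtxToCubin) appear in the diffs that follow.

    #include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h"
    #include "mlir/IR/Diagnostics.h"
    #include "llvm/ADT/StringRef.h"

    using namespace mlir;

    // Hypothetical generator: performs no real compilation, just validates
    // the input and copies the PTX bytes into the "CUBIN" buffer. Errors are
    // reported against `loc`; `name` identifies the kernel module in logs.
    OwnedCubin echoPtxAsCubin(const std::string &ptx, Location loc,
                              StringRef name) {
      if (ptx.empty()) {
        emitError(loc, "no PTX was produced for ") << name;
        return {};
      }
      return std::make_unique<std::vector<char>>(ptx.begin(), ptx.end());
    }

Such a callback is handed to the pass created via this header; the pass invokes
it once per kernel module with the PTX it produced.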
66 changes: 31 additions & 35 deletions mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp
@@ -62,8 +62,10 @@ class GpuKernelToCubinPass : public ModulePass<GpuKernelToCubinPass> {
       : cubinGenerator(cubinGenerator) {}
 
   void runOnModule() override {
-    if (!getModule().getAttrOfType<UnitAttr>(
-            gpu::GPUDialect::getKernelModuleAttrName()))
+    ModuleOp module = getModule();
+    if (!module.getAttrOfType<UnitAttr>(
+            gpu::GPUDialect::getKernelModuleAttrName()) ||
+        !module.getName())
       return;
 
     // Make sure the NVPTX target is initialized.
@@ -72,31 +74,35 @@ class GpuKernelToCubinPass : public ModulePass<GpuKernelToCubinPass> {
     LLVMInitializeNVPTXTargetMC();
     LLVMInitializeNVPTXAsmPrinter();
 
-    auto llvmModule = translateModuleToNVVMIR(getModule());
+    auto llvmModule = translateModuleToNVVMIR(module);
     if (!llvmModule)
       return signalPassFailure();
 
-    for (auto function : getModule().getOps<FuncOp>()) {
-      if (!gpu::GPUDialect::isKernel(function))
-        continue;
-      if (failed(translateGpuKernelToCubinAnnotation(*llvmModule, function)))
-        signalPassFailure();
-    }
+    // Translate the module to CUBIN and attach the result as an attribute to
+    // the module.
+    if (auto cubinAttr = translateGpuModuleToCubinAnnotation(
+            *llvmModule, module.getLoc(), *module.getName()))
+      module.setAttr(kCubinAnnotation, cubinAttr);
+    else
+      signalPassFailure();
   }
 
 private:
   static OwnedCubin compilePtxToCubinForTesting(const std::string &ptx,
-                                                FuncOp &function);
+                                                Location, StringRef);
 
   std::string translateModuleToPtx(llvm::Module &module,
                                    llvm::TargetMachine &target_machine);
 
-  /// Converts llvmModule to cubin using the user-provded generator.
-  OwnedCubin convertModuleToCubin(llvm::Module &llvmModule, FuncOp &function);
+  /// Converts llvmModule to cubin using the user-provided generator. Location
+  /// is used for error reporting and name is forwarded to the CUBIN generator
+  /// to use in its logging mechanisms.
+  OwnedCubin convertModuleToCubin(llvm::Module &llvmModule, Location loc,
+                                  StringRef name);
 
-  /// Translates llvmModule to cubin and assigns it to attribute of function.
-  LogicalResult translateGpuKernelToCubinAnnotation(llvm::Module &llvmModule,
-                                                    FuncOp &function);
+  /// Translates llvmModule to cubin and returns the result as an attribute.
+  StringAttr translateGpuModuleToCubinAnnotation(llvm::Module &llvmModule,
+                                                 Location loc, StringRef name);
 
   CubinGenerator cubinGenerator;
 };
@@ -120,13 +126,14 @@ std::string GpuKernelToCubinPass::translateModuleToPtx(
 
 OwnedCubin
 GpuKernelToCubinPass::compilePtxToCubinForTesting(const std::string &ptx,
-                                                  FuncOp &function) {
+                                                  Location, StringRef) {
   const char data[] = "CUBIN";
   return std::make_unique<std::vector<char>>(data, data + sizeof(data) - 1);
 }
 
 OwnedCubin GpuKernelToCubinPass::convertModuleToCubin(llvm::Module &llvmModule,
-                                                      FuncOp &function) {
+                                                      Location loc,
+                                                      StringRef name) {
   std::unique_ptr<llvm::TargetMachine> targetMachine;
   {
     std::string error;
@@ -136,7 +143,7 @@ OwnedCubin GpuKernelToCubinPass::convertModuleToCubin(llvm::Module &llvmModule,
     const llvm::Target *target =
         llvm::TargetRegistry::lookupTarget("", triple, error);
     if (target == nullptr) {
-      function.emitError("cannot initialize target triple");
+      emitError(loc, "cannot initialize target triple");
       return {};
     }
     targetMachine.reset(
@@ -148,26 +155,15 @@ OwnedCubin GpuKernelToCubinPass::convertModuleToCubin(llvm::Module &llvmModule,
 
   auto ptx = translateModuleToPtx(llvmModule, *targetMachine);
 
-  return cubinGenerator(ptx, function);
+  return cubinGenerator(ptx, loc, name);
 }
 
-LogicalResult GpuKernelToCubinPass::translateGpuKernelToCubinAnnotation(
-    llvm::Module &llvmModule, FuncOp &function) {
-  auto cubin = convertModuleToCubin(llvmModule, function);
+StringAttr GpuKernelToCubinPass::translateGpuModuleToCubinAnnotation(
+    llvm::Module &llvmModule, Location loc, StringRef name) {
+  auto cubin = convertModuleToCubin(llvmModule, loc, name);
   if (!cubin)
-    return function.emitError("translation to CUDA binary failed.");
-
-  Builder builder(function.getContext());
-  function.setAttr(kCubinAnnotation,
-                   builder.getStringAttr({cubin->data(), cubin->size()}));
-
-  // Remove the body of the kernel function now that it has been translated.
-  // The main reason to do this is so that the resulting module no longer
-  // contains the NVVM instructions (typically contained in the kernel bodies)
-  // and hence can be compiled into host code by a separate pass.
-  function.eraseBody();
-
-  return success();
+    return {};
+  return StringAttr::get({cubin->data(), cubin->size()}, loc->getContext());
 }
 
 std::unique_ptr<OpPassBase<ModuleOp>>
18 changes: 7 additions & 11 deletions mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp
@@ -120,7 +120,7 @@ class GpuLaunchFuncToCudaCallsPass
 
   void declareCudaFunctions(Location loc);
   Value *setupParamsArray(gpu::LaunchFuncOp launchOp, OpBuilder &builder);
-  Value *generateKernelNameConstant(FuncOp kernelFunction, Location &loc,
+  Value *generateKernelNameConstant(StringRef name, Location &loc,
                                     OpBuilder &builder);
   void translateGpuLaunchCalls(mlir::gpu::LaunchFuncOp launchOp);
 
@@ -304,14 +304,12 @@ GpuLaunchFuncToCudaCallsPass::setupParamsArray(gpu::LaunchFuncOp launchOp,
 //   %2 = llvm.getelementptr %0[%1, %1] : !llvm<"i8*">
 // }
 Value *GpuLaunchFuncToCudaCallsPass::generateKernelNameConstant(
-    FuncOp kernelFunction, Location &loc, OpBuilder &builder) {
+    StringRef name, Location &loc, OpBuilder &builder) {
   // Make sure the trailing zero is included in the constant.
-  std::vector<char> kernelName(kernelFunction.getName().begin(),
-                               kernelFunction.getName().end());
+  std::vector<char> kernelName(name.begin(), name.end());
   kernelName.push_back('\0');
 
-  std::string globalName =
-      llvm::formatv("{0}_kernel_name", kernelFunction.getName());
+  std::string globalName = llvm::formatv("{0}_kernel_name", name);
   return LLVM::createGlobalString(
       loc, builder, globalName, StringRef(kernelName.data(), kernelName.size()),
       llvmDialect);
@@ -350,12 +348,10 @@ void GpuLaunchFuncToCudaCallsPass::translateGpuLaunchCalls(
   auto kernelModule =
       getModule().lookupSymbol<ModuleOp>(launchOp.getKernelModuleName());
   assert(kernelModule && "expected a kernel module");
-  auto kernelFunction = kernelModule.lookupSymbol<FuncOp>(launchOp.kernel());
-  assert(kernelFunction && "expected a kernel function");
 
-  auto cubinAttr = kernelFunction.getAttrOfType<StringAttr>(kCubinAnnotation);
+  auto cubinAttr = kernelModule.getAttrOfType<StringAttr>(kCubinAnnotation);
   if (!cubinAttr) {
-    kernelFunction.emitOpError()
+    kernelModule.emitOpError()
         << "missing " << kCubinAnnotation << " attribute";
     return signalPassFailure();
   }
@@ -376,7 +372,7 @@ void GpuLaunchFuncToCudaCallsPass::translateGpuLaunchCalls(
   // the kernel function.
   auto cuOwningModuleRef =
       builder.create<LLVM::LoadOp>(loc, getPointerType(), cuModule);
-  auto kernelName = generateKernelNameConstant(kernelFunction, loc, builder);
+  auto kernelName = generateKernelNameConstant(launchOp.kernel(), loc, builder);
   auto cuFunction = allocatePointer(builder, loc);
   FuncOp cuModuleGetFunction =
       getModule().lookupSymbol<FuncOp>(cuModuleGetFunctionName);
4 changes: 2 additions & 2 deletions mlir/test/Conversion/GPUToCUDA/lower-launch-func-to-cuda.mlir
@@ -5,9 +5,9 @@ module attributes {gpu.container_module} {
   // CHECK: llvm.mlir.global constant @[[kernel_name:.*]]("kernel\00")
   // CHECK: llvm.mlir.global constant @[[global:.*]]("CUBIN")
 
-  module @kernel_module attributes {gpu.kernel_module} {
+  module @kernel_module attributes {gpu.kernel_module, nvvm.cubin = "CUBIN"} {
     func @kernel(!llvm.float, !llvm<"float*">)
-        attributes { gpu.kernel, nvvm.cubin = "CUBIN" }
+        attributes { gpu.kernel }
   }
 
   func @foo() {
@@ -1,10 +1,9 @@
 // RUN: mlir-opt %s --test-kernel-to-cubin -split-input-file | FileCheck %s
 
-module attributes {gpu.kernel_module} {
+// CHECK: attributes {gpu.kernel_module, nvvm.cubin = "CUBIN"}
+module @kernels attributes {gpu.kernel_module} {
   func @kernel(%arg0 : !llvm.float, %arg1 : !llvm<"float*">)
-    // CHECK: attributes {gpu.kernel, nvvm.cubin = "CUBIN"}
     attributes { gpu.kernel } {
     // CHECK-NOT: llvm.return
     llvm.return
   }
 }
25 changes: 13 additions & 12 deletions mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp
@@ -43,24 +43,25 @@
 using namespace mlir;
 
 inline void emit_cuda_error(const llvm::Twine &message, const char *buffer,
-                            CUresult error, FuncOp &function) {
-  function.emitError(message.concat(" failed with error code ")
-                         .concat(llvm::Twine{error})
-                         .concat("[")
-                         .concat(buffer)
-                         .concat("]"));
+                            CUresult error, Location loc) {
+  emitError(loc, message.concat(" failed with error code ")
+                     .concat(llvm::Twine{error})
+                     .concat("[")
+                     .concat(buffer)
+                     .concat("]"));
 }
 
 #define RETURN_ON_CUDA_ERROR(expr, msg)                                       \
   {                                                                           \
     auto _cuda_error = (expr);                                                \
     if (_cuda_error != CUDA_SUCCESS) {                                        \
-      emit_cuda_error(msg, jitErrorBuffer, _cuda_error, function);            \
+      emit_cuda_error(msg, jitErrorBuffer, _cuda_error, loc);                 \
       return {};                                                              \
     }                                                                         \
   }
 
-OwnedCubin compilePtxToCubin(const std::string ptx, FuncOp &function) {
+OwnedCubin compilePtxToCubin(const std::string ptx, Location loc,
+                             StringRef name) {
   char jitErrorBuffer[4096] = {0};
 
   RETURN_ON_CUDA_ERROR(cuInit(0), "cuInit");
@@ -86,10 +87,10 @@ OwnedCubin compilePtxToCubin(const std::string ptx, FuncOp &function) {
   RETURN_ON_CUDA_ERROR(
       cuLinkAddData(linkState, CUjitInputType::CU_JIT_INPUT_PTX,
                     const_cast<void *>(static_cast<const void *>(ptx.c_str())),
-                    ptx.length(), function.getName().data(), /* kernel name */
-                    0,                                       /* number of jit options */
-                    nullptr,                                 /* jit options */
-                    nullptr                                  /* jit option values */
+                    ptx.length(), name.data(), /* kernel name */
+                    0,                         /* number of jit options */
+                    nullptr,                   /* jit options */
+                    nullptr                    /* jit option values */
       ),
       "cuLinkAddData");
 
