[mlir] Use target-specific GPU kernel attributes in lowering pipelines
Until now, the GPU translation to NVVM or ROCDL intrinsics relied on the
presence of the generic `gpu.kernel` attribute to attach additional LLVM IR
metadata to the relevant functions. This would be problematic if each dialect
were to handle the conversion of its own options, which is the intended
direction for the translation infrastructure. Introduce `nvvm.kernel` and
`rocdl.kernel` in addition to `gpu.kernel` and base translation on these new
attributes instead.

Reviewed By: herhut

Differential Revision: https://reviews.llvm.org/D96591
ftynse committed Feb 12, 2021
1 parent 85fe5c9 commit 4c4876c
Showing 15 changed files with 218 additions and 140 deletions.
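To illustrate the intended outcome, here is a minimal sketch (the function and module names are hypothetical, and the syntax approximates the LLVM dialect of this period) of what the GPU-to-NVVM lowering now produces for a kernel:

```mlir
// Hypothetical input: a GPU kernel.
gpu.module @kernels {
  gpu.func @noop() kernel {
    gpu.return
  }
}

// After the GPU-to-NVVM conversion: the generic attribute (still expected
// by gpu.launch_func) and the target-specific one (consumed by the NVVM
// translation) are both attached.
llvm.func @noop() attributes {gpu.kernel, nvvm.kernel} {
  llvm.return
}
```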
6 changes: 6 additions & 0 deletions mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -24,6 +24,12 @@ def NVVM_Dialect : Dialect {
  let name = "nvvm";
  let cppNamespace = "::mlir::NVVM";
  let dependentDialects = ["LLVM::LLVMDialect"];

  let extraClassDeclaration = [{
    /// Get the name of the attribute used to annotate external kernel
    /// functions.
    static StringRef getKernelFuncAttrName() { return "nvvm.kernel"; }
  }];
}

//===----------------------------------------------------------------------===//
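The accessor above names the attribute that the NVVM translation now keys on. A sketch of a function that carries only the target-specific attribute (hypothetical name; assuming the `mlir-translate --mlir-to-nvvmir` path of the time, which should emit `!nvvm.annotations` metadata marking the function as a kernel in the resulting LLVM IR):

```mlir
// Hypothetical translation input: nvvm.kernel alone suffices here;
// gpu.kernel is no longer consulted by the translation.
llvm.func @step() attributes {nvvm.kernel} {
  llvm.return
}
```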
6 changes: 6 additions & 0 deletions mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -24,6 +24,12 @@ def ROCDL_Dialect : Dialect {
  let name = "rocdl";
  let cppNamespace = "::mlir::ROCDL";
  let dependentDialects = ["LLVM::LLVMDialect"];

  let extraClassDeclaration = [{
    /// Get the name of the attribute used to annotate external kernel
    /// functions.
    static StringRef getKernelFuncAttrName() { return "rocdl.kernel"; }
  }];
}

//===----------------------------------------------------------------------===//
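The ROCDL side is symmetric. A sketch, assuming the `mlir-translate --mlir-to-rocdlir` path of the time marks such functions with the amdgpu_kernel calling convention in the emitted LLVM IR (hypothetical function name):

```mlir
llvm.func @step() attributes {rocdl.kernel} {
  llvm.return
}
```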
1 change: 1 addition & 0 deletions mlir/lib/Conversion/GPUCommon/CMakeLists.txt
@@ -17,6 +17,7 @@ endif()
add_mlir_conversion_library(MLIRGPUToGPURuntimeTransforms
  ConvertLaunchFuncToRuntimeCalls.cpp
  ConvertKernelFuncToBlob.cpp
  GPUOpsLowering.cpp

  DEPENDS
  MLIRConversionPassIncGen
148 changes: 148 additions & 0 deletions mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
@@ -0,0 +1,148 @@
//===- GPUOpsLowering.cpp - GPU FuncOp / ReturnOp lowering ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "GPUOpsLowering.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/Builders.h"
#include "llvm/Support/FormatVariadic.h"

using namespace mlir;

LogicalResult
GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp,
                                   ArrayRef<Value> operands,
                                   ConversionPatternRewriter &rewriter) const {
  assert(operands.empty() && "func op is not expected to have operands");
  Location loc = gpuFuncOp.getLoc();

  SmallVector<LLVM::GlobalOp, 3> workgroupBuffers;
  workgroupBuffers.reserve(gpuFuncOp.getNumWorkgroupAttributions());
  for (auto en : llvm::enumerate(gpuFuncOp.getWorkgroupAttributions())) {
    Value attribution = en.value();

    auto type = attribution.getType().dyn_cast<MemRefType>();
    assert(type && type.hasStaticShape() && "unexpected type in attribution");

    uint64_t numElements = type.getNumElements();

    auto elementType =
        typeConverter->convertType(type.getElementType()).template cast<Type>();
    auto arrayType = LLVM::LLVMArrayType::get(elementType, numElements);
    std::string name = std::string(
        llvm::formatv("__wg_{0}_{1}", gpuFuncOp.getName(), en.index()));
    auto globalOp = rewriter.create<LLVM::GlobalOp>(
        gpuFuncOp.getLoc(), arrayType, /*isConstant=*/false,
        LLVM::Linkage::Internal, name, /*value=*/Attribute(),
        gpu::GPUDialect::getWorkgroupAddressSpace());
    workgroupBuffers.push_back(globalOp);
  }

  // Rewrite the original GPU function to an LLVM function.
  auto funcType = typeConverter->convertType(gpuFuncOp.getType())
                      .template cast<LLVM::LLVMPointerType>()
                      .getElementType();

  // Remap proper input types.
  TypeConverter::SignatureConversion signatureConversion(
      gpuFuncOp.front().getNumArguments());
  getTypeConverter()->convertFunctionSignature(
      gpuFuncOp.getType(), /*isVariadic=*/false, signatureConversion);

  // Create the new function operation. Only copy those attributes that are
  // not specific to function modeling.
  SmallVector<NamedAttribute, 4> attributes;
  for (const auto &attr : gpuFuncOp.getAttrs()) {
    if (attr.first == SymbolTable::getSymbolAttrName() ||
        attr.first == impl::getTypeAttrName() ||
        attr.first == gpu::GPUFuncOp::getNumWorkgroupAttributionsAttrName())
      continue;
    attributes.push_back(attr);
  }
  // Add a dialect specific kernel attribute in addition to GPU kernel
  // attribute. The former is necessary for further translation while the
  // latter is expected by gpu.launch_func.
  if (gpuFuncOp.isKernel())
    attributes.emplace_back(kernelAttributeName, rewriter.getUnitAttr());
  auto llvmFuncOp = rewriter.create<LLVM::LLVMFuncOp>(
      gpuFuncOp.getLoc(), gpuFuncOp.getName(), funcType,
      LLVM::Linkage::External, attributes);

  {
    // Insert operations that correspond to converted workgroup and private
    // memory attributions to the body of the function. This must operate on
    // the original function, before the body region is inlined in the new
    // function to maintain the relation between block arguments and the
    // parent operation that assigns their semantics.
    OpBuilder::InsertionGuard guard(rewriter);

    // Rewrite workgroup memory attributions to addresses of global buffers.
    rewriter.setInsertionPointToStart(&gpuFuncOp.front());
    unsigned numProperArguments = gpuFuncOp.getNumArguments();
    auto i32Type = IntegerType::get(rewriter.getContext(), 32);

    Value zero = nullptr;
    if (!workgroupBuffers.empty())
      zero = rewriter.create<LLVM::ConstantOp>(loc, i32Type,
                                               rewriter.getI32IntegerAttr(0));
    for (auto en : llvm::enumerate(workgroupBuffers)) {
      LLVM::GlobalOp global = en.value();
      Value address = rewriter.create<LLVM::AddressOfOp>(loc, global);
      auto elementType =
          global.getType().cast<LLVM::LLVMArrayType>().getElementType();
      Value memory = rewriter.create<LLVM::GEPOp>(
          loc, LLVM::LLVMPointerType::get(elementType, global.addr_space()),
          address, ArrayRef<Value>{zero, zero});

      // Build a memref descriptor pointing to the buffer to plug with the
      // existing memref infrastructure. This may use more registers than
      // otherwise necessary given that memref sizes are fixed, but we can try
      // and canonicalize that away later.
      Value attribution = gpuFuncOp.getWorkgroupAttributions()[en.index()];
      auto type = attribution.getType().cast<MemRefType>();
      auto descr = MemRefDescriptor::fromStaticShape(
          rewriter, loc, *getTypeConverter(), type, memory);
      signatureConversion.remapInput(numProperArguments + en.index(), descr);
    }

    // Rewrite private memory attributions to alloca'ed buffers.
    unsigned numWorkgroupAttributions = gpuFuncOp.getNumWorkgroupAttributions();
    auto int64Ty = IntegerType::get(rewriter.getContext(), 64);
    for (auto en : llvm::enumerate(gpuFuncOp.getPrivateAttributions())) {
      Value attribution = en.value();
      auto type = attribution.getType().cast<MemRefType>();
      assert(type && type.hasStaticShape() && "unexpected type in attribution");

      // Explicitly drop memory space when lowering private memory
      // attributions since NVVM models it as `alloca`s in the default
      // memory space and does not support `alloca`s with addrspace(5).
      auto ptrType = LLVM::LLVMPointerType::get(
          typeConverter->convertType(type.getElementType())
              .template cast<Type>(),
          allocaAddrSpace);
      Value numElements = rewriter.create<LLVM::ConstantOp>(
          gpuFuncOp.getLoc(), int64Ty,
          rewriter.getI64IntegerAttr(type.getNumElements()));
      Value allocated = rewriter.create<LLVM::AllocaOp>(
          gpuFuncOp.getLoc(), ptrType, numElements, /*alignment=*/0);
      auto descr = MemRefDescriptor::fromStaticShape(
          rewriter, loc, *getTypeConverter(), type, allocated);
      signatureConversion.remapInput(
          numProperArguments + numWorkgroupAttributions + en.index(), descr);
    }
  }

  // Move the region to the new function, update the entry block signature.
  rewriter.inlineRegionBefore(gpuFuncOp.getBody(), llvmFuncOp.getBody(),
                              llvmFuncOp.end());
  if (failed(rewriter.convertRegionTypes(&llvmFuncOp.getBody(), *typeConverter,
                                         &signatureConversion)))
    return failure();

  rewriter.eraseOp(gpuFuncOp);
  return success();
}
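To make the attribution handling above concrete, a sketch of the rewrite for a single workgroup attribution (names and exact type syntax are illustrative approximations of the period's LLVM dialect, shown for the NVVM instantiation):

```mlir
// Hypothetical input: one workgroup attribution of 32 floats.
gpu.func @reduce() workgroup(%sum : memref<32xf32, 3>) kernel {
  gpu.return
}

// Approximate result: an internal global named __wg_<func>_<index> in the
// workgroup address space, whose address is taken and wrapped in a memref
// descriptor that replaces the %sum block argument.
llvm.mlir.global internal @__wg_reduce_0() {addr_space = 3 : i32} : !llvm.array<32 x f32>

llvm.func @reduce() attributes {gpu.kernel, nvvm.kernel} {
  %0 = llvm.mlir.constant(0 : i32) : i32
  %1 = llvm.mlir.addressof @__wg_reduce_0 : !llvm.ptr<array<32 x f32>, 3>
  %2 = llvm.getelementptr %1[%0, %0]
      : (!llvm.ptr<array<32 x f32>, 3>, i32, i32) -> !llvm.ptr<f32, 3>
  llvm.return
}
```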
141 changes: 11 additions & 130 deletions mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
@@ -11,145 +11,26 @@
#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/Builders.h"
#include "llvm/Support/FormatVariadic.h"

namespace mlir {

template <unsigned AllocaAddrSpace>
struct GPUFuncOpLowering : ConvertOpToLLVMPattern<gpu::GPUFuncOp> {
  using ConvertOpToLLVMPattern<gpu::GPUFuncOp>::ConvertOpToLLVMPattern;
  GPUFuncOpLowering(LLVMTypeConverter &converter, unsigned allocaAddrSpace,
                    Identifier kernelAttributeName)
      : ConvertOpToLLVMPattern<gpu::GPUFuncOp>(converter),
        allocaAddrSpace(allocaAddrSpace),
        kernelAttributeName(kernelAttributeName) {}

  LogicalResult
  matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override {
    assert(operands.empty() && "func op is not expected to have operands");
    Location loc = gpuFuncOp.getLoc();

    SmallVector<LLVM::GlobalOp, 3> workgroupBuffers;
    workgroupBuffers.reserve(gpuFuncOp.getNumWorkgroupAttributions());
    for (auto en : llvm::enumerate(gpuFuncOp.getWorkgroupAttributions())) {
      Value attribution = en.value();

      auto type = attribution.getType().dyn_cast<MemRefType>();
      assert(type && type.hasStaticShape() && "unexpected type in attribution");

      uint64_t numElements = type.getNumElements();

      auto elementType = typeConverter->convertType(type.getElementType())
                             .template cast<Type>();
      auto arrayType = LLVM::LLVMArrayType::get(elementType, numElements);
      std::string name = std::string(
          llvm::formatv("__wg_{0}_{1}", gpuFuncOp.getName(), en.index()));
      auto globalOp = rewriter.create<LLVM::GlobalOp>(
          gpuFuncOp.getLoc(), arrayType, /*isConstant=*/false,
          LLVM::Linkage::Internal, name, /*value=*/Attribute(),
          gpu::GPUDialect::getWorkgroupAddressSpace());
      workgroupBuffers.push_back(globalOp);
    }

    // Rewrite the original GPU function to an LLVM function.
    auto funcType = typeConverter->convertType(gpuFuncOp.getType())
                        .template cast<LLVM::LLVMPointerType>()
                        .getElementType();

    // Remap proper input types.
    TypeConverter::SignatureConversion signatureConversion(
        gpuFuncOp.front().getNumArguments());
    getTypeConverter()->convertFunctionSignature(
        gpuFuncOp.getType(), /*isVariadic=*/false, signatureConversion);

    // Create the new function operation. Only copy those attributes that are
    // not specific to function modeling.
    SmallVector<NamedAttribute, 4> attributes;
    for (const auto &attr : gpuFuncOp.getAttrs()) {
      if (attr.first == SymbolTable::getSymbolAttrName() ||
          attr.first == impl::getTypeAttrName() ||
          attr.first == gpu::GPUFuncOp::getNumWorkgroupAttributionsAttrName())
        continue;
      attributes.push_back(attr);
    }
    auto llvmFuncOp = rewriter.create<LLVM::LLVMFuncOp>(
        gpuFuncOp.getLoc(), gpuFuncOp.getName(), funcType,
        LLVM::Linkage::External, attributes);
                  ConversionPatternRewriter &rewriter) const override;

    {
      // Insert operations that correspond to converted workgroup and private
      // memory attributions to the body of the function. This must operate on
      // the original function, before the body region is inlined in the new
      // function to maintain the relation between block arguments and the
      // parent operation that assigns their semantics.
      OpBuilder::InsertionGuard guard(rewriter);
private:
  /// The address space to use for `alloca`s in private memory.
  unsigned allocaAddrSpace;

      // Rewrite workgroup memory attributions to addresses of global buffers.
      rewriter.setInsertionPointToStart(&gpuFuncOp.front());
      unsigned numProperArguments = gpuFuncOp.getNumArguments();
      auto i32Type = IntegerType::get(rewriter.getContext(), 32);

      Value zero = nullptr;
      if (!workgroupBuffers.empty())
        zero = rewriter.create<LLVM::ConstantOp>(loc, i32Type,
                                                 rewriter.getI32IntegerAttr(0));
      for (auto en : llvm::enumerate(workgroupBuffers)) {
        LLVM::GlobalOp global = en.value();
        Value address = rewriter.create<LLVM::AddressOfOp>(loc, global);
        auto elementType =
            global.getType().cast<LLVM::LLVMArrayType>().getElementType();
        Value memory = rewriter.create<LLVM::GEPOp>(
            loc, LLVM::LLVMPointerType::get(elementType, global.addr_space()),
            address, ArrayRef<Value>{zero, zero});

        // Build a memref descriptor pointing to the buffer to plug with the
        // existing memref infrastructure. This may use more registers than
        // otherwise necessary given that memref sizes are fixed, but we can try
        // and canonicalize that away later.
        Value attribution = gpuFuncOp.getWorkgroupAttributions()[en.index()];
        auto type = attribution.getType().cast<MemRefType>();
        auto descr = MemRefDescriptor::fromStaticShape(
            rewriter, loc, *getTypeConverter(), type, memory);
        signatureConversion.remapInput(numProperArguments + en.index(), descr);
      }

      // Rewrite private memory attributions to alloca'ed buffers.
      unsigned numWorkgroupAttributions =
          gpuFuncOp.getNumWorkgroupAttributions();
      auto int64Ty = IntegerType::get(rewriter.getContext(), 64);
      for (auto en : llvm::enumerate(gpuFuncOp.getPrivateAttributions())) {
        Value attribution = en.value();
        auto type = attribution.getType().cast<MemRefType>();
        assert(type && type.hasStaticShape() &&
               "unexpected type in attribution");

        // Explicitly drop memory space when lowering private memory
        // attributions since NVVM models it as `alloca`s in the default
        // memory space and does not support `alloca`s with addrspace(5).
        auto ptrType = LLVM::LLVMPointerType::get(
            typeConverter->convertType(type.getElementType())
                .template cast<Type>(),
            AllocaAddrSpace);
        Value numElements = rewriter.create<LLVM::ConstantOp>(
            gpuFuncOp.getLoc(), int64Ty,
            rewriter.getI64IntegerAttr(type.getNumElements()));
        Value allocated = rewriter.create<LLVM::AllocaOp>(
            gpuFuncOp.getLoc(), ptrType, numElements, /*alignment=*/0);
        auto descr = MemRefDescriptor::fromStaticShape(
            rewriter, loc, *getTypeConverter(), type, allocated);
        signatureConversion.remapInput(
            numProperArguments + numWorkgroupAttributions + en.index(), descr);
      }
    }

    // Move the region to the new function, update the entry block signature.
    rewriter.inlineRegionBefore(gpuFuncOp.getBody(), llvmFuncOp.getBody(),
                                llvmFuncOp.end());
    if (failed(rewriter.convertRegionTypes(
            &llvmFuncOp.getBody(), *typeConverter, &signatureConversion)))
      return failure();

    rewriter.eraseOp(gpuFuncOp);
    return success();
  }
  /// The attribute name to use instead of `gpu.kernel`.
  Identifier kernelAttributeName;
};

struct GPUReturnOpLowering : public ConvertOpToLLVMPattern<gpu::ReturnOp> {
1 change: 1 addition & 0 deletions mlir/lib/Conversion/GPUToNVVM/CMakeLists.txt
@@ -11,6 +11,7 @@ add_mlir_conversion_library(MLIRGPUToNVVMTransforms

  LINK_LIBS PUBLIC
  MLIRGPU
  MLIRGPUToGPURuntimeTransforms
  MLIRLLVMIR
  MLIRNVVMIR
  MLIRPass
15 changes: 10 additions & 5 deletions mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -167,11 +167,16 @@ void mlir::populateGpuToNVVMConversionPatterns(
                                      NVVM::BlockIdYOp, NVVM::BlockIdZOp>,
          GPUIndexIntrinsicOpLowering<gpu::GridDimOp, NVVM::GridDimXOp,
                                      NVVM::GridDimYOp, NVVM::GridDimZOp>,
          GPUShuffleOpLowering, GPUReturnOpLowering,
          // Explicitly drop memory space when lowering private memory
          // attributions since NVVM models it as `alloca`s in the default
          // memory space and does not support `alloca`s with addrspace(5).
          GPUFuncOpLowering<0>>(converter);
          GPUShuffleOpLowering, GPUReturnOpLowering>(converter);

  // Explicitly drop memory space when lowering private memory
  // attributions since NVVM models it as `alloca`s in the default
  // memory space and does not support `alloca`s with addrspace(5).
  patterns.insert<GPUFuncOpLowering>(
      converter, /*allocaAddrSpace=*/0,
      Identifier::get(NVVM::NVVMDialect::getKernelFuncAttrName(),
                      &converter.getContext()));

  patterns.insert<OpToFuncCallLowering<AbsFOp>>(converter, "__nv_fabsf",
                                                "__nv_fabs");
  patterns.insert<OpToFuncCallLowering<math::AtanOp>>(converter, "__nv_atanf",
1 change: 1 addition & 0 deletions mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt
@@ -11,6 +11,7 @@ add_mlir_conversion_library(MLIRGPUToROCDLTransforms

  LINK_LIBS PUBLIC
  MLIRGPU
  MLIRGPUToGPURuntimeTransforms
  MLIRLLVMIR
  MLIRROCDLIR
  MLIRPass
6 changes: 5 additions & 1 deletion mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -103,7 +103,11 @@ void mlir::populateGpuToROCDLConversionPatterns(
                                      ROCDL::BlockIdYOp, ROCDL::BlockIdZOp>,
          GPUIndexIntrinsicOpLowering<gpu::GridDimOp, ROCDL::GridDimXOp,
                                      ROCDL::GridDimYOp, ROCDL::GridDimZOp>,
          GPUFuncOpLowering<5>, GPUReturnOpLowering>(converter);
          GPUReturnOpLowering>(converter);
  patterns.insert<GPUFuncOpLowering>(
      converter, /*allocaAddrSpace=*/5,
      Identifier::get(ROCDL::ROCDLDialect::getKernelFuncAttrName(),
                      &converter.getContext()));
  patterns.insert<OpToFuncCallLowering<AbsFOp>>(converter, "__ocml_fabs_f32",
                                                "__ocml_fabs_f64");
  patterns.insert<OpToFuncCallLowering<math::AtanOp>>(
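The only target-visible difference between the two instantiations is the `allocaAddrSpace` constructor argument, i.e., the address space of the `alloca`s that back private memory attributions. A sketch of the lowered body for a private attribution `%tmp : memref<4xf32, 5>` (illustrative syntax):

```mlir
// NVVM (allocaAddrSpace = 0): the memory space is dropped.
%c4 = llvm.mlir.constant(4 : i64) : i64
%buf = llvm.alloca %c4 x f32 : (i64) -> !llvm.ptr<f32>

// ROCDL (allocaAddrSpace = 5): the AMDGPU private address space is kept.
%c4r = llvm.mlir.constant(4 : i64) : i64
%bufr = llvm.alloca %c4r x f32 : (i64) -> !llvm.ptr<f32, 5>
```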
