[mlir] Use target-specific GPU kernel attributes in lowering pipelines
Until now, the GPU translation to NVVM or ROCDL intrinsics relied on the
presence of the generic `gpu.kernel` attribute to attach additional LLVM IR
metadata to the relevant functions. This would be problematic if each dialect
were to handle the conversion of its own options, which is the intended
direction for the translation infrastructure. Introduce `nvvm.kernel` and
`rocdl.kernel` in addition to `gpu.kernel` and base translation on these new
attributes instead.

Reviewed By: herhut

Differential Revision: https://reviews.llvm.org/D96591
ftynse committed Feb 12, 2021
1 parent 85fe5c9 commit 4c4876c
Showing 15 changed files with 218 additions and 140 deletions.
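To illustrate the intended outcome, here is a minimal sketch (the function and module names are hypothetical, and the syntax approximates the LLVM dialect of this period) of what the GPU-to-NVVM lowering now produces for a kernel:

```mlir
// Hypothetical input: a GPU kernel.
gpu.module @kernels {
  gpu.func @noop() kernel {
    gpu.return
  }
}

// After the GPU-to-NVVM conversion: the generic attribute (still expected
// by gpu.launch_func) and the target-specific one (consumed by the NVVM
// translation) are both attached.
llvm.func @noop() attributes {gpu.kernel, nvvm.kernel} {
  llvm.return
}
```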
6 changes: 6 additions & 0 deletions mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -24,6 +24,12 @@ def NVVM_Dialect : Dialect {
  let name = "nvvm";
  let cppNamespace = "::mlir::NVVM";
  let dependentDialects = ["LLVM::LLVMDialect"];

  let extraClassDeclaration = [{
    /// Get the name of the attribute used to annotate external kernel
    /// functions.
    static StringRef getKernelFuncAttrName() { return "nvvm.kernel"; }
  }];
}

//===----------------------------------------------------------------------===//
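The accessor above names the attribute that the NVVM translation now keys on. A sketch of a function that carries only the target-specific attribute (hypothetical name; assuming the `mlir-translate --mlir-to-nvvmir` path of the time, which should emit `!nvvm.annotations` metadata marking the function as a kernel in the resulting LLVM IR):

```mlir
// Hypothetical translation input: nvvm.kernel alone suffices here;
// gpu.kernel is no longer consulted by the translation.
llvm.func @step() attributes {nvvm.kernel} {
  llvm.return
}
```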
6 changes: 6 additions & 0 deletions mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -24,6 +24,12 @@ def ROCDL_Dialect : Dialect {
  let name = "rocdl";
  let cppNamespace = "::mlir::ROCDL";
  let dependentDialects = ["LLVM::LLVMDialect"];

  let extraClassDeclaration = [{
    /// Get the name of the attribute used to annotate external kernel
    /// functions.
    static StringRef getKernelFuncAttrName() { return "rocdl.kernel"; }
  }];
}

//===----------------------------------------------------------------------===//
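The ROCDL side is symmetric. A sketch, assuming the `mlir-translate --mlir-to-rocdlir` path of the time marks such functions with the amdgpu_kernel calling convention in the emitted LLVM IR (hypothetical function name):

```mlir
llvm.func @step() attributes {rocdl.kernel} {
  llvm.return
}
```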
1 change: 1 addition & 0 deletions mlir/lib/Conversion/GPUCommon/CMakeLists.txt
@@ -17,6 +17,7 @@ endif()
add_mlir_conversion_library(MLIRGPUToGPURuntimeTransforms
  ConvertLaunchFuncToRuntimeCalls.cpp
  ConvertKernelFuncToBlob.cpp
  GPUOpsLowering.cpp

  DEPENDS
  MLIRConversionPassIncGen
148 changes: 148 additions & 0 deletions mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
@@ -0,0 +1,148 @@
//===- GPUOpsLowering.cpp - GPU FuncOp / ReturnOp lowering ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "GPUOpsLowering.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/Builders.h"
#include "llvm/Support/FormatVariadic.h"

using namespace mlir;

LogicalResult
GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp,
                                   ArrayRef<Value> operands,
                                   ConversionPatternRewriter &rewriter) const {
  assert(operands.empty() && "func op is not expected to have operands");
  Location loc = gpuFuncOp.getLoc();

  SmallVector<LLVM::GlobalOp, 3> workgroupBuffers;
  workgroupBuffers.reserve(gpuFuncOp.getNumWorkgroupAttributions());
  for (auto en : llvm::enumerate(gpuFuncOp.getWorkgroupAttributions())) {
    Value attribution = en.value();

    auto type = attribution.getType().dyn_cast<MemRefType>();
    assert(type && type.hasStaticShape() && "unexpected type in attribution");

    uint64_t numElements = type.getNumElements();

    auto elementType =
        typeConverter->convertType(type.getElementType()).template cast<Type>();
    auto arrayType = LLVM::LLVMArrayType::get(elementType, numElements);
    std::string name = std::string(
        llvm::formatv("__wg_{0}_{1}", gpuFuncOp.getName(), en.index()));
    auto globalOp = rewriter.create<LLVM::GlobalOp>(
        gpuFuncOp.getLoc(), arrayType, /*isConstant=*/false,
        LLVM::Linkage::Internal, name, /*value=*/Attribute(),
        gpu::GPUDialect::getWorkgroupAddressSpace());
    workgroupBuffers.push_back(globalOp);
  }

  // Rewrite the original GPU function to an LLVM function.
  auto funcType = typeConverter->convertType(gpuFuncOp.getType())
                      .template cast<LLVM::LLVMPointerType>()
                      .getElementType();

  // Remap proper input types.
  TypeConverter::SignatureConversion signatureConversion(
      gpuFuncOp.front().getNumArguments());
  getTypeConverter()->convertFunctionSignature(
      gpuFuncOp.getType(), /*isVariadic=*/false, signatureConversion);

  // Create the new function operation. Only copy those attributes that are
  // not specific to function modeling.
  SmallVector<NamedAttribute, 4> attributes;
  for (const auto &attr : gpuFuncOp.getAttrs()) {
    if (attr.first == SymbolTable::getSymbolAttrName() ||
        attr.first == impl::getTypeAttrName() ||
        attr.first == gpu::GPUFuncOp::getNumWorkgroupAttributionsAttrName())
      continue;
    attributes.push_back(attr);
  }
  // Add a dialect specific kernel attribute in addition to GPU kernel
  // attribute. The former is necessary for further translation while the
  // latter is expected by gpu.launch_func.
  if (gpuFuncOp.isKernel())
    attributes.emplace_back(kernelAttributeName, rewriter.getUnitAttr());
  auto llvmFuncOp = rewriter.create<LLVM::LLVMFuncOp>(
      gpuFuncOp.getLoc(), gpuFuncOp.getName(), funcType,
      LLVM::Linkage::External, attributes);

  {
    // Insert operations that correspond to converted workgroup and private
    // memory attributions to the body of the function. This must operate on
    // the original function, before the body region is inlined in the new
    // function to maintain the relation between block arguments and the
    // parent operation that assigns their semantics.
    OpBuilder::InsertionGuard guard(rewriter);

    // Rewrite workgroup memory attributions to addresses of global buffers.
    rewriter.setInsertionPointToStart(&gpuFuncOp.front());
    unsigned numProperArguments = gpuFuncOp.getNumArguments();
    auto i32Type = IntegerType::get(rewriter.getContext(), 32);

    Value zero = nullptr;
    if (!workgroupBuffers.empty())
      zero = rewriter.create<LLVM::ConstantOp>(loc, i32Type,
                                               rewriter.getI32IntegerAttr(0));
    for (auto en : llvm::enumerate(workgroupBuffers)) {
      LLVM::GlobalOp global = en.value();
      Value address = rewriter.create<LLVM::AddressOfOp>(loc, global);
      auto elementType =
          global.getType().cast<LLVM::LLVMArrayType>().getElementType();
      Value memory = rewriter.create<LLVM::GEPOp>(
          loc, LLVM::LLVMPointerType::get(elementType, global.addr_space()),
          address, ArrayRef<Value>{zero, zero});

      // Build a memref descriptor pointing to the buffer to plug with the
      // existing memref infrastructure. This may use more registers than
      // otherwise necessary given that memref sizes are fixed, but we can try
      // and canonicalize that away later.
      Value attribution = gpuFuncOp.getWorkgroupAttributions()[en.index()];
      auto type = attribution.getType().cast<MemRefType>();
      auto descr = MemRefDescriptor::fromStaticShape(
          rewriter, loc, *getTypeConverter(), type, memory);
      signatureConversion.remapInput(numProperArguments + en.index(), descr);
    }

    // Rewrite private memory attributions to alloca'ed buffers.
    unsigned numWorkgroupAttributions = gpuFuncOp.getNumWorkgroupAttributions();
    auto int64Ty = IntegerType::get(rewriter.getContext(), 64);
    for (auto en : llvm::enumerate(gpuFuncOp.getPrivateAttributions())) {
      Value attribution = en.value();
      auto type = attribution.getType().cast<MemRefType>();
      assert(type && type.hasStaticShape() && "unexpected type in attribution");

      // Explicitly drop memory space when lowering private memory
      // attributions since NVVM models it as `alloca`s in the default
      // memory space and does not support `alloca`s with addrspace(5).
      auto ptrType = LLVM::LLVMPointerType::get(
          typeConverter->convertType(type.getElementType())
              .template cast<Type>(),
          allocaAddrSpace);
      Value numElements = rewriter.create<LLVM::ConstantOp>(
          gpuFuncOp.getLoc(), int64Ty,
          rewriter.getI64IntegerAttr(type.getNumElements()));
      Value allocated = rewriter.create<LLVM::AllocaOp>(
          gpuFuncOp.getLoc(), ptrType, numElements, /*alignment=*/0);
      auto descr = MemRefDescriptor::fromStaticShape(
          rewriter, loc, *getTypeConverter(), type, allocated);
      signatureConversion.remapInput(
          numProperArguments + numWorkgroupAttributions + en.index(), descr);
    }
  }

  // Move the region to the new function, update the entry block signature.
  rewriter.inlineRegionBefore(gpuFuncOp.getBody(), llvmFuncOp.getBody(),
                              llvmFuncOp.end());
  if (failed(rewriter.convertRegionTypes(&llvmFuncOp.getBody(), *typeConverter,
                                         &signatureConversion)))
    return failure();

  rewriter.eraseOp(gpuFuncOp);
  return success();
}
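To make the attribution handling above concrete, a sketch of the rewrite for a single workgroup attribution (names and exact type syntax are illustrative approximations of the period's LLVM dialect, shown for the NVVM instantiation):

```mlir
// Hypothetical input: one workgroup attribution of 32 floats.
gpu.func @reduce() workgroup(%sum : memref<32xf32, 3>) kernel {
  gpu.return
}

// Approximate result: an internal global named __wg_<func>_<index> in the
// workgroup address space, whose address is taken and wrapped in a memref
// descriptor that replaces the %sum block argument.
llvm.mlir.global internal @__wg_reduce_0() {addr_space = 3 : i32} : !llvm.array<32 x f32>

llvm.func @reduce() attributes {gpu.kernel, nvvm.kernel} {
  %0 = llvm.mlir.constant(0 : i32) : i32
  %1 = llvm.mlir.addressof @__wg_reduce_0 : !llvm.ptr<array<32 x f32>, 3>
  %2 = llvm.getelementptr %1[%0, %0]
      : (!llvm.ptr<array<32 x f32>, 3>, i32, i32) -> !llvm.ptr<f32, 3>
  llvm.return
}
```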
141 changes: 11 additions & 130 deletions mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
@@ -11,145 +11,26 @@
#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/Builders.h"
#include "llvm/Support/FormatVariadic.h"

namespace mlir {

template <unsigned AllocaAddrSpace>
struct GPUFuncOpLowering : ConvertOpToLLVMPattern<gpu::GPUFuncOp> {
  using ConvertOpToLLVMPattern<gpu::GPUFuncOp>::ConvertOpToLLVMPattern;
  GPUFuncOpLowering(LLVMTypeConverter &converter, unsigned allocaAddrSpace,
                    Identifier kernelAttributeName)
      : ConvertOpToLLVMPattern<gpu::GPUFuncOp>(converter),
        allocaAddrSpace(allocaAddrSpace),
        kernelAttributeName(kernelAttributeName) {}

  LogicalResult
  matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override {
    assert(operands.empty() && "func op is not expected to have operands");
    Location loc = gpuFuncOp.getLoc();

    SmallVector<LLVM::GlobalOp, 3> workgroupBuffers;
    workgroupBuffers.reserve(gpuFuncOp.getNumWorkgroupAttributions());
    for (auto en : llvm::enumerate(gpuFuncOp.getWorkgroupAttributions())) {
      Value attribution = en.value();

      auto type = attribution.getType().dyn_cast<MemRefType>();
      assert(type && type.hasStaticShape() && "unexpected type in attribution");

      uint64_t numElements = type.getNumElements();

      auto elementType = typeConverter->convertType(type.getElementType())
                             .template cast<Type>();
      auto arrayType = LLVM::LLVMArrayType::get(elementType, numElements);
      std::string name = std::string(
          llvm::formatv("__wg_{0}_{1}", gpuFuncOp.getName(), en.index()));
      auto globalOp = rewriter.create<LLVM::GlobalOp>(
          gpuFuncOp.getLoc(), arrayType, /*isConstant=*/false,
          LLVM::Linkage::Internal, name, /*value=*/Attribute(),
          gpu::GPUDialect::getWorkgroupAddressSpace());
      workgroupBuffers.push_back(globalOp);
    }

    // Rewrite the original GPU function to an LLVM function.
    auto funcType = typeConverter->convertType(gpuFuncOp.getType())
                        .template cast<LLVM::LLVMPointerType>()
                        .getElementType();

    // Remap proper input types.
    TypeConverter::SignatureConversion signatureConversion(
        gpuFuncOp.front().getNumArguments());
    getTypeConverter()->convertFunctionSignature(
        gpuFuncOp.getType(), /*isVariadic=*/false, signatureConversion);

    // Create the new function operation. Only copy those attributes that are
    // not specific to function modeling.
    SmallVector<NamedAttribute, 4> attributes;
    for (const auto &attr : gpuFuncOp.getAttrs()) {
      if (attr.first == SymbolTable::getSymbolAttrName() ||
          attr.first == impl::getTypeAttrName() ||
          attr.first == gpu::GPUFuncOp::getNumWorkgroupAttributionsAttrName())
        continue;
      attributes.push_back(attr);
    }
    auto llvmFuncOp = rewriter.create<LLVM::LLVMFuncOp>(
        gpuFuncOp.getLoc(), gpuFuncOp.getName(), funcType,
        LLVM::Linkage::External, attributes);
                  ConversionPatternRewriter &rewriter) const override;

    {
      // Insert operations that correspond to converted workgroup and private
      // memory attributions to the body of the function. This must operate on
      // the original function, before the body region is inlined in the new
      // function to maintain the relation between block arguments and the
      // parent operation that assigns their semantics.
      OpBuilder::InsertionGuard guard(rewriter);
private:
  /// The address space to use for `alloca`s in private memory.
  unsigned allocaAddrSpace;

      // Rewrite workgroup memory attributions to addresses of global buffers.
      rewriter.setInsertionPointToStart(&gpuFuncOp.front());
      unsigned numProperArguments = gpuFuncOp.getNumArguments();
      auto i32Type = IntegerType::get(rewriter.getContext(), 32);

      Value zero = nullptr;
      if (!workgroupBuffers.empty())
        zero = rewriter.create<LLVM::ConstantOp>(loc, i32Type,
                                                 rewriter.getI32IntegerAttr(0));
      for (auto en : llvm::enumerate(workgroupBuffers)) {
        LLVM::GlobalOp global = en.value();
        Value address = rewriter.create<LLVM::AddressOfOp>(loc, global);
        auto elementType =
            global.getType().cast<LLVM::LLVMArrayType>().getElementType();
        Value memory = rewriter.create<LLVM::GEPOp>(
            loc, LLVM::LLVMPointerType::get(elementType, global.addr_space()),
            address, ArrayRef<Value>{zero, zero});

        // Build a memref descriptor pointing to the buffer to plug with the
        // existing memref infrastructure. This may use more registers than
        // otherwise necessary given that memref sizes are fixed, but we can try
        // and canonicalize that away later.
        Value attribution = gpuFuncOp.getWorkgroupAttributions()[en.index()];
        auto type = attribution.getType().cast<MemRefType>();
        auto descr = MemRefDescriptor::fromStaticShape(
            rewriter, loc, *getTypeConverter(), type, memory);
        signatureConversion.remapInput(numProperArguments + en.index(), descr);
      }

      // Rewrite private memory attributions to alloca'ed buffers.
      unsigned numWorkgroupAttributions =
          gpuFuncOp.getNumWorkgroupAttributions();
      auto int64Ty = IntegerType::get(rewriter.getContext(), 64);
      for (auto en : llvm::enumerate(gpuFuncOp.getPrivateAttributions())) {
        Value attribution = en.value();
        auto type = attribution.getType().cast<MemRefType>();
        assert(type && type.hasStaticShape() &&
               "unexpected type in attribution");

        // Explicitly drop memory space when lowering private memory
        // attributions since NVVM models it as `alloca`s in the default
        // memory space and does not support `alloca`s with addrspace(5).
        auto ptrType = LLVM::LLVMPointerType::get(
            typeConverter->convertType(type.getElementType())
                .template cast<Type>(),
            AllocaAddrSpace);
        Value numElements = rewriter.create<LLVM::ConstantOp>(
            gpuFuncOp.getLoc(), int64Ty,
            rewriter.getI64IntegerAttr(type.getNumElements()));
        Value allocated = rewriter.create<LLVM::AllocaOp>(
            gpuFuncOp.getLoc(), ptrType, numElements, /*alignment=*/0);
        auto descr = MemRefDescriptor::fromStaticShape(
            rewriter, loc, *getTypeConverter(), type, allocated);
        signatureConversion.remapInput(
            numProperArguments + numWorkgroupAttributions + en.index(), descr);
      }
    }

    // Move the region to the new function, update the entry block signature.
    rewriter.inlineRegionBefore(gpuFuncOp.getBody(), llvmFuncOp.getBody(),
                                llvmFuncOp.end());
    if (failed(rewriter.convertRegionTypes(
            &llvmFuncOp.getBody(), *typeConverter, &signatureConversion)))
      return failure();

    rewriter.eraseOp(gpuFuncOp);
    return success();
  }
  /// The attribute name to use instead of `gpu.kernel`.
  Identifier kernelAttributeName;
};

struct GPUReturnOpLowering : public ConvertOpToLLVMPattern<gpu::ReturnOp> {
1 change: 1 addition & 0 deletions mlir/lib/Conversion/GPUToNVVM/CMakeLists.txt
@@ -11,6 +11,7 @@ add_mlir_conversion_library(MLIRGPUToNVVMTransforms

  LINK_LIBS PUBLIC
  MLIRGPU
  MLIRGPUToGPURuntimeTransforms
  MLIRLLVMIR
  MLIRNVVMIR
  MLIRPass
15 changes: 10 additions & 5 deletions mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -167,11 +167,16 @@ void mlir::populateGpuToNVVMConversionPatterns(
                                      NVVM::BlockIdYOp, NVVM::BlockIdZOp>,
          GPUIndexIntrinsicOpLowering<gpu::GridDimOp, NVVM::GridDimXOp,
                                      NVVM::GridDimYOp, NVVM::GridDimZOp>,
          GPUShuffleOpLowering, GPUReturnOpLowering,
          // Explicitly drop memory space when lowering private memory
          // attributions since NVVM models it as `alloca`s in the default
          // memory space and does not support `alloca`s with addrspace(5).
          GPUFuncOpLowering<0>>(converter);
          GPUShuffleOpLowering, GPUReturnOpLowering>(converter);

  // Explicitly drop memory space when lowering private memory
  // attributions since NVVM models it as `alloca`s in the default
  // memory space and does not support `alloca`s with addrspace(5).
  patterns.insert<GPUFuncOpLowering>(
      converter, /*allocaAddrSpace=*/0,
      Identifier::get(NVVM::NVVMDialect::getKernelFuncAttrName(),
                      &converter.getContext()));

  patterns.insert<OpToFuncCallLowering<AbsFOp>>(converter, "__nv_fabsf",
                                                "__nv_fabs");
  patterns.insert<OpToFuncCallLowering<math::AtanOp>>(converter, "__nv_atanf",
1 change: 1 addition & 0 deletions mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt
@@ -11,6 +11,7 @@ add_mlir_conversion_library(MLIRGPUToROCDLTransforms

  LINK_LIBS PUBLIC
  MLIRGPU
  MLIRGPUToGPURuntimeTransforms
  MLIRLLVMIR
  MLIRROCDLIR
  MLIRPass
6 changes: 5 additions & 1 deletion mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -103,7 +103,11 @@ void mlir::populateGpuToROCDLConversionPatterns(
                                      ROCDL::BlockIdYOp, ROCDL::BlockIdZOp>,
          GPUIndexIntrinsicOpLowering<gpu::GridDimOp, ROCDL::GridDimXOp,
                                      ROCDL::GridDimYOp, ROCDL::GridDimZOp>,
          GPUFuncOpLowering<5>, GPUReturnOpLowering>(converter);
          GPUReturnOpLowering>(converter);
  patterns.insert<GPUFuncOpLowering>(
      converter, /*allocaAddrSpace=*/5,
      Identifier::get(ROCDL::ROCDLDialect::getKernelFuncAttrName(),
                      &converter.getContext()));
  patterns.insert<OpToFuncCallLowering<AbsFOp>>(converter, "__ocml_fabs_f32",
                                                "__ocml_fabs_f64");
  patterns.insert<OpToFuncCallLowering<math::AtanOp>>(
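The only target-visible difference between the two instantiations is the `allocaAddrSpace` constructor argument, i.e., the address space of the `alloca`s that back private memory attributions. A sketch of the lowered body for a private attribution `%tmp : memref<4xf32, 5>` (illustrative syntax):

```mlir
// NVVM (allocaAddrSpace = 0): the memory space is dropped.
%c4 = llvm.mlir.constant(4 : i64) : i64
%buf = llvm.alloca %c4 x f32 : (i64) -> !llvm.ptr<f32>

// ROCDL (allocaAddrSpace = 5): the AMDGPU private address space is kept.
%c4r = llvm.mlir.constant(4 : i64) : i64
%bufr = llvm.alloca %c4r x f32 : (i64) -> !llvm.ptr<f32, 5>
```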
