Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[mlir] Use target-specific GPU kernel attributes in lowering pipelines
Until now, the GPU translation to NVVM or ROCDL intrinsics relied on the presence of the generic `gpu.kernel` attribute to attach additional LLVM IR metadata to the relevant functions. This would be problematic if each dialect were to handle the conversion of its own options, which is the intended direction for the translation infrastructure. Introduce `nvvm.kernel` and `rocdl.kernel` in addition to `gpu.kernel` and base translation on these new attributes instead. Reviewed By: herhut Differential Revision: https://reviews.llvm.org/D96591
- Loading branch information
Showing
15 changed files
with
218 additions
and
140 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,148 @@ | ||
//===- GPUOpsLowering.cpp - GPU FuncOp / ReturnOp lowering ----------------===// | ||
// | ||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||
// See https://llvm.org/LICENSE.txt for license information. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// | ||
//===----------------------------------------------------------------------===// | ||
|
||
#include "GPUOpsLowering.h" | ||
#include "mlir/Dialect/StandardOps/IR/Ops.h" | ||
#include "mlir/IR/Builders.h" | ||
#include "llvm/Support/FormatVariadic.h" | ||
|
||
using namespace mlir; | ||
|
||
/// Lowers a `gpu.func` operation to an `llvm.func` operation.
///
/// Workgroup memory attributions are turned into internal `llvm.mlir.global`
/// array buffers in the GPU dialect's workgroup address space, and private
/// memory attributions are turned into `llvm.alloca`s whose pointers live in
/// `allocaAddrSpace`. Both kinds are exposed to the function body as memref
/// descriptors by remapping the corresponding entry block arguments. When the
/// function is a kernel, a unit attribute named `kernelAttributeName` is
/// attached in addition to the attributes copied from the original operation.
///
/// \param gpuFuncOp the function to lower.
/// \param operands  converted operands; expected to be empty for a function.
/// \param rewriter  the rewriter used to create and erase operations.
/// \returns failure (before creating any operation) if an attribution is not
///          a statically shaped memref, failure if the region types cannot be
///          converted, and success otherwise.
LogicalResult
GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp,
                                   ArrayRef<Value> operands,
                                   ConversionPatternRewriter &rewriter) const {
  assert(operands.empty() && "func op is not expected to have operands");
  Location loc = gpuFuncOp.getLoc();

  // Attributions are sized from their static shape below. Reject anything
  // else up front, before any IR is created, instead of relying on asserts
  // that vanish in release builds.
  auto isStaticMemRef = [](Value attribution) {
    auto type = attribution.getType().dyn_cast<MemRefType>();
    return type && type.hasStaticShape();
  };
  if (!llvm::all_of(gpuFuncOp.getWorkgroupAttributions(), isStaticMemRef) ||
      !llvm::all_of(gpuFuncOp.getPrivateAttributions(), isStaticMemRef))
    return rewriter.notifyMatchFailure(
        gpuFuncOp, "expected statically shaped memref attributions");

  // Create one global buffer per workgroup attribution, named after the
  // function and the position of the attribution.
  SmallVector<LLVM::GlobalOp, 3> workgroupBuffers;
  workgroupBuffers.reserve(gpuFuncOp.getNumWorkgroupAttributions());
  for (auto en : llvm::enumerate(gpuFuncOp.getWorkgroupAttributions())) {
    Value attribution = en.value();
    auto type = attribution.getType().cast<MemRefType>();

    uint64_t numElements = type.getNumElements();

    auto elementType =
        typeConverter->convertType(type.getElementType()).cast<Type>();
    auto arrayType = LLVM::LLVMArrayType::get(elementType, numElements);
    std::string name = std::string(
        llvm::formatv("__wg_{0}_{1}", gpuFuncOp.getName(), en.index()));
    auto globalOp = rewriter.create<LLVM::GlobalOp>(
        loc, arrayType, /*isConstant=*/false, LLVM::Linkage::Internal, name,
        /*value=*/Attribute(), gpu::GPUDialect::getWorkgroupAddressSpace());
    workgroupBuffers.push_back(globalOp);
  }

  // Rewrite the original GPU function to an LLVM function. The converted
  // function type is wrapped in a pointer, so unwrap it.
  auto funcType = typeConverter->convertType(gpuFuncOp.getType())
                      .cast<LLVM::LLVMPointerType>()
                      .getElementType();

  // Remap proper input types. The conversion is sized for all entry block
  // arguments so that attributions can be remapped further below.
  TypeConverter::SignatureConversion signatureConversion(
      gpuFuncOp.front().getNumArguments());
  getTypeConverter()->convertFunctionSignature(
      gpuFuncOp.getType(), /*isVariadic=*/false, signatureConversion);

  // Create the new function operation. Only copy those attributes that are
  // not specific to function modeling.
  SmallVector<NamedAttribute, 4> attributes;
  for (const auto &attr : gpuFuncOp.getAttrs()) {
    if (attr.first == SymbolTable::getSymbolAttrName() ||
        attr.first == impl::getTypeAttrName() ||
        attr.first == gpu::GPUFuncOp::getNumWorkgroupAttributionsAttrName())
      continue;
    attributes.push_back(attr);
  }
  // Add a dialect specific kernel attribute in addition to GPU kernel
  // attribute. The former is necessary for further translation while the
  // latter is expected by gpu.launch_func.
  if (gpuFuncOp.isKernel())
    attributes.emplace_back(kernelAttributeName, rewriter.getUnitAttr());
  auto llvmFuncOp = rewriter.create<LLVM::LLVMFuncOp>(
      loc, gpuFuncOp.getName(), funcType, LLVM::Linkage::External, attributes);

  {
    // Insert operations that correspond to converted workgroup and private
    // memory attributions to the body of the function. This must operate on
    // the original function, before the body region is inlined in the new
    // function to maintain the relation between block arguments and the
    // parent operation that assigns their semantics.
    OpBuilder::InsertionGuard guard(rewriter);

    // Rewrite workgroup memory attributions to addresses of global buffers.
    rewriter.setInsertionPointToStart(&gpuFuncOp.front());
    unsigned numProperArguments = gpuFuncOp.getNumArguments();
    auto i32Type = IntegerType::get(rewriter.getContext(), 32);

    // The zero index constant is only materialized when a GEP needs it.
    Value zero = nullptr;
    if (!workgroupBuffers.empty())
      zero = rewriter.create<LLVM::ConstantOp>(loc, i32Type,
                                               rewriter.getI32IntegerAttr(0));
    for (auto en : llvm::enumerate(workgroupBuffers)) {
      LLVM::GlobalOp global = en.value();
      Value address = rewriter.create<LLVM::AddressOfOp>(loc, global);
      auto elementType =
          global.getType().cast<LLVM::LLVMArrayType>().getElementType();
      // Index [0, 0] turns the pointer-to-array into a pointer to its first
      // element, in the same address space as the global.
      Value memory = rewriter.create<LLVM::GEPOp>(
          loc, LLVM::LLVMPointerType::get(elementType, global.addr_space()),
          address, ArrayRef<Value>{zero, zero});

      // Build a memref descriptor pointing to the buffer to plug with the
      // existing memref infrastructure. This may use more registers than
      // otherwise necessary given that memref sizes are fixed, but we can try
      // and canonicalize that away later.
      Value attribution = gpuFuncOp.getWorkgroupAttributions()[en.index()];
      auto type = attribution.getType().cast<MemRefType>();
      auto descr = MemRefDescriptor::fromStaticShape(
          rewriter, loc, *getTypeConverter(), type, memory);
      signatureConversion.remapInput(numProperArguments + en.index(), descr);
    }

    // Rewrite private memory attributions to alloca'ed buffers.
    unsigned numWorkgroupAttributions = gpuFuncOp.getNumWorkgroupAttributions();
    auto int64Ty = IntegerType::get(rewriter.getContext(), 64);
    for (auto en : llvm::enumerate(gpuFuncOp.getPrivateAttributions())) {
      Value attribution = en.value();
      auto type = attribution.getType().cast<MemRefType>();

      // The memory space of the attribution's memref type is deliberately
      // not used for the pointer: the address space of the `alloca` is given
      // by `allocaAddrSpace` instead, since targets differ in which address
      // space they accept for `alloca`s (e.g. NVVM models private memory as
      // `alloca`s in the default memory space and does not support `alloca`s
      // with addrspace(5)).
      auto ptrType = LLVM::LLVMPointerType::get(
          typeConverter->convertType(type.getElementType()).cast<Type>(),
          allocaAddrSpace);
      Value numElements = rewriter.create<LLVM::ConstantOp>(
          loc, int64Ty, rewriter.getI64IntegerAttr(type.getNumElements()));
      Value allocated = rewriter.create<LLVM::AllocaOp>(
          loc, ptrType, numElements, /*alignment=*/0);
      auto descr = MemRefDescriptor::fromStaticShape(
          rewriter, loc, *getTypeConverter(), type, allocated);
      // Private attributions follow the proper arguments and the workgroup
      // attributions in the entry block argument list.
      signatureConversion.remapInput(
          numProperArguments + numWorkgroupAttributions + en.index(), descr);
    }
  }

  // Move the region to the new function, update the entry block signature.
  rewriter.inlineRegionBefore(gpuFuncOp.getBody(), llvmFuncOp.getBody(),
                              llvmFuncOp.end());
  if (failed(rewriter.convertRegionTypes(&llvmFuncOp.getBody(), *typeConverter,
                                         &signatureConversion)))
    return failure();

  rewriter.eraseOp(gpuFuncOp);
  return success();
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.