diff --git a/mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h b/mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h index f61e40ef5f969..4eb6379adf6e7 100644 --- a/mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h +++ b/mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h @@ -19,17 +19,12 @@ namespace mlir { class Location; class ModuleOp; -template -class OpPassBase; - -namespace gpu { -class GPUModuleOp; -} // namespace gpu - namespace LLVM { class LLVMDialect; } // namespace LLVM +template class OpPassBase; + using OwnedCubin = std::unique_ptr>; using CubinGenerator = std::function; @@ -43,7 +38,7 @@ using CubinGenerator = /// attached as a string attribute named 'nvvm.cubin' to the kernel function. /// After the transformation, the body of the kernel function is removed (i.e., /// it is turned into a declaration). -std::unique_ptr> +std::unique_ptr> createConvertGPUKernelToCubinPass(CubinGenerator cubinGenerator); /// Creates a pass to convert a gpu.launch_func operation into a sequence of diff --git a/mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h b/mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h index b3212279fab98..75e4f7e374c6a 100644 --- a/mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h +++ b/mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h @@ -14,19 +14,15 @@ namespace mlir { class LLVMTypeConverter; class OwningRewritePatternList; -template -class OpPassBase; - -namespace gpu { -class GPUModuleOp; -} +class ModuleOp; +template class OpPassBase; /// Collect a set of patterns to convert from the GPU dialect to NVVM. void populateGpuToNVVMConversionPatterns(LLVMTypeConverter &converter, OwningRewritePatternList &patterns); /// Creates a pass that lowers GPU dialect operations to NVVM counterparts. -std::unique_ptr> createLowerGpuOpsToNVVMOpsPass(); +std::unique_ptr> createLowerGpuOpsToNVVMOpsPass(); } // namespace mlir diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td index 3df6ff4be0c1d..766ddbf202c25 100644 --- a/mlir/include/mlir/Dialect/GPU/GPUOps.td +++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td @@ -588,56 +588,4 @@ def GPU_BarrierOp : GPU_Op<"barrier"> { let printer = [{ p << getOperationName(); }]; } -def GPU_GPUModuleOp : GPU_Op<"module", [ - IsolatedFromAbove, SymbolTable, Symbol, - SingleBlockImplicitTerminator<"ModuleEndOp"> -]> { - let summary = "A top level compilation unit containing code to be run on a GPU."; - let description = [{ - GPU module contains code that is intended to be run on a GPU. A host device - can launch this code through a gpu.launc_func that creates a fully - qualified symbol through the gpu.module's symbol and a gpu.func symbol - contained in the gpu.module. - - The module's top-level scope is modeled by a single region with a single - block. GPU modules are required to have a name that is used for symbol - resolution by the gpu.launch_func operation. - - Using an op with a region to define a GPU module enables "embedding" GPU - modules with SIMT execution models in other dialects in a clean manner and - allows filtering of code regions to execute passes on only code intended to - or not intended to be run on the separate device. - - ``` - gpu.module @symbol_name { - gpu.func {} - ... - gpu.module_end - } - - ``` - }]; - let builders = [OpBuilder<"Builder *builder, OperationState &result, " - "StringRef name">]; - let parser = [{ return ::parseGPUModuleOp(parser, result); }]; - let printer = [{ return ::print(p, *this); }]; - let regions = (region SizedRegion<1>:$body); - - // We need to ensure the block inside the region is properly terminated; - // the auto-generated builders do not guarantee that. - let skipDefaultBuilders = 1; -} - -def GPU_ModuleEndOp : GPU_Op<"module_end", [ - Terminator, HasParent<"GPUModuleOp"> -]> { - let summary = "A pseudo op that marks the end of a gpu.module."; - let description = [{ - This op terminates the only block inside the only region of a `gpu.module`. - }]; - - let parser = [{ return success(); }]; - let printer = [{ p << getOperationName(); }]; -} - #endif // GPU_OPS diff --git a/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp b/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp index b111c96313c2c..66a2e66f99a4e 100644 --- a/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp +++ b/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp @@ -46,15 +46,18 @@ static constexpr const char *kCubinAnnotation = "nvvm.cubin"; /// IR and further to PTX. A user provided CubinGenerator compiles the PTX to /// GPU binary code, which is then attached as an attribute to the function. The /// function body is erased. -class GpuKernelToCubinPass - : public OperationPass { +class GpuKernelToCubinPass : public ModulePass { public: GpuKernelToCubinPass( CubinGenerator cubinGenerator = compilePtxToCubinForTesting) : cubinGenerator(cubinGenerator) {} - void runOnOperation() override { - gpu::GPUModuleOp module = getOperation(); + void runOnModule() override { + ModuleOp module = getModule(); + if (!module.getAttrOfType( + gpu::GPUDialect::getKernelModuleAttrName()) || + !module.getName()) + return; // Make sure the NVPTX target is initialized. LLVMInitializeNVPTXTarget(); @@ -68,8 +71,8 @@ class GpuKernelToCubinPass // Translate the module to CUBIN and attach the result as attribute to the // module. - if (auto cubinAttr = translateGPUModuleToCubinAnnotation( - *llvmModule, module.getLoc(), module.getName())) + if (auto cubinAttr = translateGpuModuleToCubinAnnotation( + *llvmModule, module.getLoc(), *module.getName())) module.setAttr(kCubinAnnotation, cubinAttr); else signalPassFailure(); @@ -89,7 +92,7 @@ class GpuKernelToCubinPass StringRef name); /// Translates llvmModule to cubin and returns the result as attribute. - StringAttr translateGPUModuleToCubinAnnotation(llvm::Module &llvmModule, + StringAttr translateGpuModuleToCubinAnnotation(llvm::Module &llvmModule, Location loc, StringRef name); CubinGenerator cubinGenerator; @@ -146,7 +149,7 @@ OwnedCubin GpuKernelToCubinPass::convertModuleToCubin(llvm::Module &llvmModule, return cubinGenerator(ptx, loc, name); } -StringAttr GpuKernelToCubinPass::translateGPUModuleToCubinAnnotation( +StringAttr GpuKernelToCubinPass::translateGpuModuleToCubinAnnotation( llvm::Module &llvmModule, Location loc, StringRef name) { auto cubin = convertModuleToCubin(llvmModule, loc, name); if (!cubin) @@ -154,7 +157,7 @@ StringAttr GpuKernelToCubinPass::translateGPUModuleToCubinAnnotation( return StringAttr::get({cubin->data(), cubin->size()}, loc->getContext()); } -std::unique_ptr> +std::unique_ptr> mlir::createConvertGPUKernelToCubinPass(CubinGenerator cubinGenerator) { return std::make_unique(cubinGenerator); } diff --git a/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp b/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp index 31024d2881b5c..41f69d6e21d1b 100644 --- a/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp +++ b/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp @@ -132,9 +132,9 @@ class GpuLaunchFuncToCudaCallsPass // GPU kernel modules are no longer necessary since we have a global // constant with the CUBIN data. - for (auto m : - llvm::make_early_inc_range(getModule().getOps())) - m.erase(); + for (auto m : llvm::make_early_inc_range(getModule().getOps())) + if (m.getAttrOfType(gpu::GPUDialect::getKernelModuleAttrName())) + m.erase(); } private: @@ -343,8 +343,8 @@ void GpuLaunchFuncToCudaCallsPass::translateGpuLaunchCalls( builder.getI32IntegerAttr(0)); // Create an LLVM global with CUBIN extracted from the kernel annotation and // obtain a pointer to the first byte in it. - auto kernelModule = getModule().lookupSymbol( - launchOp.getKernelModuleName()); + auto kernelModule = + getModule().lookupSymbol(launchOp.getKernelModuleName()); assert(kernelModule && "expected a kernel module"); auto cubinAttr = kernelModule.getAttrOfType(kCubinAnnotation); @@ -354,7 +354,8 @@ void GpuLaunchFuncToCudaCallsPass::translateGpuLaunchCalls( return signalPassFailure(); } - SmallString<128> nameBuffer(kernelModule.getName()); + assert(kernelModule.getName() && "expected a named module"); + SmallString<128> nameBuffer(*kernelModule.getName()); nameBuffer.append(kCubinStorageSuffix); Value data = LLVM::createGlobalString( loc, builder, nameBuffer.str(), cubinAttr.getValue(), diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp index 84bc7ff1d5f52..e2b1e0e533c53 100644 --- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp +++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp @@ -200,7 +200,7 @@ struct GPUAllReduceOpLowering : public LLVMOpLowering { auto type = operand.getType().cast(); // Create shared memory array to store the warp reduction. - auto module = operand.getDefiningOp()->getParentOfType(); + auto module = operand.getDefiningOp()->getParentOfType(); assert(module && "op must belong to a module"); Value sharedMemPtr = createSharedMemoryArray(loc, module, type, kWarpSize, rewriter); @@ -391,10 +391,10 @@ struct GPUAllReduceOpLowering : public LLVMOpLowering { } /// Creates a global array stored in shared memory. - Value createSharedMemoryArray(Location loc, gpu::GPUModuleOp module, + Value createSharedMemoryArray(Location loc, ModuleOp module, LLVM::LLVMType elementType, int numElements, ConversionPatternRewriter &rewriter) const { - OpBuilder builder(module.body()); + OpBuilder builder(module.getBodyRegion()); auto arrayType = LLVM::LLVMType::getArrayTy(elementType, numElements); StringRef name = "reduce_buffer"; @@ -699,11 +699,13 @@ struct GPUReturnOpLowering : public LLVMOpLowering { /// /// This pass only handles device code and is not meant to be run on GPU host /// code. -class LowerGpuOpsToNVVMOpsPass - : public OperationPass { +class LowerGpuOpsToNVVMOpsPass : public ModulePass { public: - void runOnOperation() override { - gpu::GPUModuleOp m = getOperation(); + void runOnModule() override { + ModuleOp m = getModule(); + if (!m.getAttrOfType(gpu::GPUDialect::getKernelModuleAttrName())) + return; + OwningRewritePatternList patterns; NVVMTypeConverter converter(m.getContext()); populateStdToLLVMConversionPatterns(converter, patterns); @@ -716,7 +718,7 @@ class LowerGpuOpsToNVVMOpsPass target.addLegalDialect(); target.addLegalDialect(); // TODO(csigg): Remove once we support replacing non-root ops. - target.addLegalOp(); + target.addLegalOp(); if (failed(applyPartialConversion(m, target, patterns, &converter))) signalPassFailure(); } @@ -748,8 +750,7 @@ void mlir::populateGpuToNVVMConversionPatterns( "__nv_exp"); } -std::unique_ptr> -mlir::createLowerGpuOpsToNVVMOpsPass() { +std::unique_ptr> mlir::createLowerGpuOpsToNVVMOpsPass() { return std::make_unique(); } diff --git a/mlir/lib/Conversion/GPUToSPIRV/CMakeLists.txt b/mlir/lib/Conversion/GPUToSPIRV/CMakeLists.txt index adeb4e099ab9e..be82894461d61 100644 --- a/mlir/lib/Conversion/GPUToSPIRV/CMakeLists.txt +++ b/mlir/lib/Conversion/GPUToSPIRV/CMakeLists.txt @@ -1,15 +1,8 @@ -set(LLVM_TARGET_DEFINITIONS GPUToSPIRV.td) -mlir_tablegen(GPUToSPIRV.cpp.inc -gen-rewriters) -add_public_tablegen_target(MLIRGPUToSPIRVIncGen) - add_llvm_library(MLIRGPUtoSPIRVTransforms ConvertGPUToSPIRV.cpp ConvertGPUToSPIRVPass.cpp ) -add_dependencies(MLIRGPUtoSPIRVTransforms - MLIRGPUToSPIRVIncGen) - target_link_libraries(MLIRGPUtoSPIRVTransforms MLIRGPU MLIRIR diff --git a/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRV.cpp b/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRV.cpp index a90cea99be492..2fd8cedfd63b5 100644 --- a/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRV.cpp +++ b/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRV.cpp @@ -63,13 +63,27 @@ class KernelFnConversion final : public SPIRVOpLowering { SmallVector workGroupSizeAsInt32; }; -/// Pattern to convert a gpu.module to a spv.module. -class GPUModuleConversion final : public SPIRVOpLowering { +/// Pattern to convert a module with gpu.kernel_module attribute to a +/// spv.module. +class KernelModuleConversion final : public SPIRVOpLowering { public: - using SPIRVOpLowering::SPIRVOpLowering; + using SPIRVOpLowering::SPIRVOpLowering; PatternMatchResult - matchAndRewrite(gpu::GPUModuleOp moduleOp, ArrayRef operands, + matchAndRewrite(ModuleOp moduleOp, ArrayRef operands, + ConversionPatternRewriter &rewriter) const override; +}; + +/// Pattern to convert a module terminator op to a terminator of spv.module op. +// TODO: Move this into DRR, but that requires ModuleTerminatorOp to be defined +// in ODS. +class KernelModuleTerminatorConversion final + : public SPIRVOpLowering { +public: + using SPIRVOpLowering::SPIRVOpLowering; + + PatternMatchResult + matchAndRewrite(ModuleTerminatorOp terminatorOp, ArrayRef operands, ConversionPatternRewriter &rewriter) const override; }; @@ -270,12 +284,16 @@ KernelFnConversion::matchAndRewrite(gpu::GPUFuncOp funcOp, } //===----------------------------------------------------------------------===// -// ModuleOp with gpu.module. +// ModuleOp with gpu.kernel_module. //===----------------------------------------------------------------------===// -PatternMatchResult GPUModuleConversion::matchAndRewrite( - gpu::GPUModuleOp moduleOp, ArrayRef operands, +PatternMatchResult KernelModuleConversion::matchAndRewrite( + ModuleOp moduleOp, ArrayRef operands, ConversionPatternRewriter &rewriter) const { + if (!moduleOp.getAttrOfType( + gpu::GPUDialect::getKernelModuleAttrName())) { + return matchFailure(); + } // TODO : Generalize this to account for different extensions, // capabilities, extended_instruction_sets, other addressing models // and memory models. @@ -284,8 +302,8 @@ PatternMatchResult GPUModuleConversion::matchAndRewrite( spirv::MemoryModel::GLSL450, spirv::Capability::Shader, spirv::Extension::SPV_KHR_storage_buffer_storage_class); // Move the region from the module op into the SPIR-V module. - Region &spvModuleRegion = spvModule.body(); - rewriter.inlineRegionBefore(moduleOp.body(), spvModuleRegion, + Region &spvModuleRegion = spvModule.getOperation()->getRegion(0); + rewriter.inlineRegionBefore(moduleOp.getBodyRegion(), spvModuleRegion, spvModuleRegion.begin()); // The spv.module build method adds a block with a terminator. Remove that // block. The terminator of the module op in the remaining block will be @@ -295,6 +313,17 @@ PatternMatchResult GPUModuleConversion::matchAndRewrite( return matchSuccess(); } +//===----------------------------------------------------------------------===// +// ModuleTerminatorOp for gpu.kernel_module. +//===----------------------------------------------------------------------===// + +PatternMatchResult KernelModuleTerminatorConversion::matchAndRewrite( + ModuleTerminatorOp terminatorOp, ArrayRef operands, + ConversionPatternRewriter &rewriter) const { + rewriter.replaceOpWithNewOp(terminatorOp); + return matchSuccess(); +} + //===----------------------------------------------------------------------===// // GPU return inside kernel functions to SPIR-V return. //===----------------------------------------------------------------------===// @@ -313,18 +342,14 @@ PatternMatchResult GPUReturnOpConversion::matchAndRewrite( // GPU To SPIRV Patterns. //===----------------------------------------------------------------------===// -namespace { -#include "GPUToSPIRV.cpp.inc" -} - void mlir::populateGPUToSPIRVPatterns(MLIRContext *context, SPIRVTypeConverter &typeConverter, OwningRewritePatternList &patterns, ArrayRef workGroupSize) { - populateWithGenerated(context, &patterns); patterns.insert(context, typeConverter, workGroupSize); patterns.insert< - GPUReturnOpConversion, ForOpConversion, GPUModuleConversion, + GPUReturnOpConversion, ForOpConversion, KernelModuleConversion, + KernelModuleTerminatorConversion, LaunchConfigConversion, LaunchConfigConversion, LaunchConfigConversion, diff --git a/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRVPass.cpp b/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRVPass.cpp index 4dda8bdc2b39f..9b758d1052c76 100644 --- a/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRVPass.cpp +++ b/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRVPass.cpp @@ -60,12 +60,15 @@ void GPUToSPIRVPass::runOnModule() { SmallVector kernelModules; OpBuilder builder(context); - module.walk([&builder, &kernelModules](gpu::GPUModuleOp moduleOp) { - // For each kernel module (should be only 1 for now, but that is not a - // requirement here), clone the module for conversion because the - // gpu.launch function still needs the kernel module. - builder.setInsertionPoint(moduleOp.getOperation()); - kernelModules.push_back(builder.clone(*moduleOp.getOperation())); + module.walk([&builder, &kernelModules](ModuleOp moduleOp) { + if (moduleOp.getAttrOfType( + gpu::GPUDialect::getKernelModuleAttrName())) { + // For each kernel module (should be only 1 for now, but that is not a + // requirement here), clone the module for conversion because the + // gpu.launch function still needs the kernel module. + builder.setInsertionPoint(moduleOp.getOperation()); + kernelModules.push_back(builder.clone(*moduleOp.getOperation())); + } }); SPIRVTypeConverter typeConverter; diff --git a/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.td b/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.td deleted file mode 100644 index cfe9d26273cc4..0000000000000 --- a/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.td +++ /dev/null @@ -1,22 +0,0 @@ -//===-- GPUToSPIRV.td - GPU to SPIR-V Dialect Lowerings ----*- tablegen -*-===// -// -// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains patterns to lower GPU dialect ops to to SPIR-V ops. -// -//===----------------------------------------------------------------------===// - - -#ifndef CONVERT_GPU_TO_SPIRV -#define CONVERT_GPU_TO_SPIRV - -include "mlir/Dialect/GPU/GPUOps.td" -include "mlir/Dialect/SPIRV/SPIRVStructureOps.td" - -def : Pat<(GPU_ModuleEndOp), (SPV_ModuleEndOp)>; - -#endif // CONVERT_GPU_TO_SPIRV diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp index 083e282602015..e5bcc69734474 100644 --- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp +++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp @@ -72,10 +72,15 @@ LogicalResult GPUDialect::verifyOperationAttribute(Operation *op, // Check that `launch_func` refers to a well-formed GPU kernel module. StringRef kernelModuleName = launchOp.getKernelModuleName(); - auto kernelModule = module.lookupSymbol(kernelModuleName); + auto kernelModule = module.lookupSymbol(kernelModuleName); if (!kernelModule) return launchOp.emitOpError() << "kernel module '" << kernelModuleName << "' is undefined"; + if (!kernelModule.getAttrOfType( + GPUDialect::getKernelModuleAttrName())) + return launchOp.emitOpError("module '") + << kernelModuleName << "' is missing the '" + << GPUDialect::getKernelModuleAttrName() << "' attribute"; // Check that `launch_func` refers to a well-formed kernel function. StringRef kernelName = launchOp.kernel(); @@ -512,9 +517,10 @@ void LaunchFuncOp::build(Builder *builder, OperationState &result, result.addOperands(kernelOperands); result.addAttribute(getKernelAttrName(), builder->getStringAttr(kernelFunc.getName())); - auto kernelModule = kernelFunc.getParentOfType(); - result.addAttribute(getKernelModuleAttrName(), - builder->getSymbolRefAttr(kernelModule.getName())); + auto kernelModule = kernelFunc.getParentOfType(); + if (Optional kernelModuleName = kernelModule.getName()) + result.addAttribute(getKernelModuleAttrName(), + builder->getSymbolRefAttr(*kernelModuleName)); } void LaunchFuncOp::build(Builder *builder, OperationState &result, @@ -814,47 +820,6 @@ LogicalResult GPUFuncOp::verifyBody() { return success(); } -//===----------------------------------------------------------------------===// -// GPUModuleOp -//===----------------------------------------------------------------------===// - -void GPUModuleOp::build(Builder *builder, OperationState &result, - StringRef name) { - ensureTerminator(*result.addRegion(), *builder, result.location); - result.attributes.push_back(builder->getNamedAttr( - ::mlir::SymbolTable::getSymbolAttrName(), builder->getStringAttr(name))); -} - -static ParseResult parseGPUModuleOp(OpAsmParser &parser, - OperationState &result) { - StringAttr nameAttr; - if (parser.parseSymbolName(nameAttr, SymbolTable::getSymbolAttrName(), - result.attributes)) - return failure(); - - // If module attributes are present, parse them. - if (parser.parseOptionalAttrDictWithKeyword(result.attributes)) - return failure(); - - // Parse the module body. - auto *body = result.addRegion(); - if (parser.parseRegion(*body, None, None)) - return failure(); - - // Ensure that this module has a valid terminator. - GPUModuleOp::ensureTerminator(*body, parser.getBuilder(), result.location); - return success(); -} - -static void print(OpAsmPrinter &p, GPUModuleOp op) { - p << op.getOperationName() << ' '; - p.printSymbolName(op.getName()); - p.printOptionalAttrDictWithKeyword(op.getAttrs(), - {SymbolTable::getSymbolAttrName()}); - p.printRegion(op.getOperation()->getRegion(0), /*printEntryBlockArgs=*/false, - /*printBlockTerminators=*/false); -} - // Namespace avoids ambiguous ReturnOpOperandAdaptor. namespace mlir { namespace gpu { diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp index 0f8e2253980c2..37f9c2e7b843f 100644 --- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp @@ -140,8 +140,8 @@ namespace { /// inside a nested module. It also creates an external function of the same /// name in the parent module. /// -/// The gpu.modules are intended to be compiled to a cubin blob independently in -/// a separate pass. The external functions can then be annotated with the +/// The kernel modules are intended to be compiled to a cubin blob independently +/// in a separate pass. The external functions can then be annotated with the /// symbol of the cubin accessor function. class GpuKernelOutliningPass : public ModulePass { public: @@ -174,19 +174,15 @@ class GpuKernelOutliningPass : public ModulePass { } private: - // Returns a gpu.module containing kernelFunc and all callees (recursive). - gpu::GPUModuleOp createKernelModule(gpu::GPUFuncOp kernelFunc, - const SymbolTable &parentSymbolTable) { - // TODO: This code cannot use an OpBuilder because it must be inserted into - // a SymbolTable by the caller. SymbolTable needs to be refactored to - // prevent manual building of Ops with symbols in code using SymbolTables - // and then this needs to use the OpBuilder. + // Returns a module containing kernelFunc and all callees (recursive). + ModuleOp createKernelModule(gpu::GPUFuncOp kernelFunc, + const SymbolTable &parentSymbolTable) { auto context = getModule().getContext(); Builder builder(context); - OperationState state(kernelFunc.getLoc(), - gpu::GPUModuleOp::getOperationName()); - gpu::GPUModuleOp::build(&builder, state, kernelFunc.getName()); - auto kernelModule = cast(Operation::create(state)); + auto kernelModule = + ModuleOp::create(builder.getUnknownLoc(), kernelFunc.getName()); + kernelModule.setAttr(gpu::GPUDialect::getKernelModuleAttrName(), + builder.getUnitAttr()); SymbolTable symbolTable(kernelModule); symbolTable.insert(kernelFunc); diff --git a/mlir/test/Conversion/GPUToCUDA/lower-launch-func-to-cuda.mlir b/mlir/test/Conversion/GPUToCUDA/lower-launch-func-to-cuda.mlir index 707f4a0639587..6865462595f74 100644 --- a/mlir/test/Conversion/GPUToCUDA/lower-launch-func-to-cuda.mlir +++ b/mlir/test/Conversion/GPUToCUDA/lower-launch-func-to-cuda.mlir @@ -5,7 +5,7 @@ module attributes {gpu.container_module} { // CHECK: llvm.mlir.global internal constant @[[kernel_name:.*]]("kernel\00") // CHECK: llvm.mlir.global internal constant @[[global:.*]]("CUBIN") - gpu.module @kernel_module attributes {nvvm.cubin = "CUBIN"} { + module @kernel_module attributes {gpu.kernel_module, nvvm.cubin = "CUBIN"} { gpu.func @kernel(%arg0: !llvm.float, %arg1: !llvm<"float*">) attributes {gpu.kernel} { gpu.return } diff --git a/mlir/test/Conversion/GPUToCUDA/lower-nvvm-kernel-to-cubin.mlir b/mlir/test/Conversion/GPUToCUDA/lower-nvvm-kernel-to-cubin.mlir index 78b9f56b6202c..62fe2b993388b 100644 --- a/mlir/test/Conversion/GPUToCUDA/lower-nvvm-kernel-to-cubin.mlir +++ b/mlir/test/Conversion/GPUToCUDA/lower-nvvm-kernel-to-cubin.mlir @@ -1,7 +1,7 @@ // RUN: mlir-opt %s --test-kernel-to-cubin -split-input-file | FileCheck %s -// CHECK: attributes {nvvm.cubin = "CUBIN"} -gpu.module @foo { +// CHECK: attributes {gpu.kernel_module, nvvm.cubin = "CUBIN"} +module @foo attributes {gpu.kernel_module} { llvm.func @kernel(%arg0 : !llvm.float, %arg1 : !llvm<"float*">) // CHECK: attributes {gpu.kernel} attributes { gpu.kernel } { @@ -11,7 +11,7 @@ gpu.module @foo { // ----- -gpu.module @bar { +module @bar attributes {gpu.kernel_module} { // CHECK: func @kernel_a llvm.func @kernel_a() attributes { gpu.kernel } { diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir index 7f69cb7482c54..24bf56557c3e2 100644 --- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir +++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir @@ -1,6 +1,6 @@ // RUN: mlir-opt %s -convert-gpu-to-nvvm -split-input-file | FileCheck %s -gpu.module @test_module { +module attributes {gpu.kernel_module} { // CHECK-LABEL: func @gpu_index_ops() func @gpu_index_ops() attributes { gpu.kernel } { @@ -38,7 +38,7 @@ gpu.module @test_module { // ----- -gpu.module @test_module { +module attributes {gpu.kernel_module} { // CHECK-LABEL: func @gpu_all_reduce_op() func @gpu_all_reduce_op() attributes { gpu.kernel } { @@ -55,7 +55,7 @@ gpu.module @test_module { // ----- -gpu.module @test_module { +module attributes {gpu.kernel_module} { // CHECK-LABEL: func @gpu_all_reduce_region() func @gpu_all_reduce_region() attributes { gpu.kernel } { @@ -74,7 +74,7 @@ gpu.module @test_module { // ----- -gpu.module @test_module { +module attributes {gpu.kernel_module} { // CHECK-LABEL: func @gpu_shuffle() func @gpu_shuffle() attributes { gpu.kernel } { @@ -99,7 +99,7 @@ gpu.module @test_module { // ----- -gpu.module @test_module { +module attributes {gpu.kernel_module} { // CHECK-LABEL: func @gpu_sync() func @gpu_sync() attributes { gpu.kernel } { @@ -111,7 +111,7 @@ gpu.module @test_module { // ----- -gpu.module @test_module { +module attributes {gpu.kernel_module} { // CHECK: llvm.func @__nv_fabsf(!llvm.float) -> !llvm.float // CHECK: llvm.func @__nv_fabs(!llvm.double) -> !llvm.double // CHECK-LABEL: func @gpu_fabs @@ -126,7 +126,7 @@ gpu.module @test_module { // ----- -gpu.module @test_module { +module attributes {gpu.kernel_module} { // CHECK: llvm.func @__nv_ceilf(!llvm.float) -> !llvm.float // CHECK: llvm.func @__nv_ceil(!llvm.double) -> !llvm.double // CHECK-LABEL: func @gpu_ceil @@ -141,7 +141,7 @@ gpu.module @test_module { // ----- -gpu.module @test_module { +module attributes {gpu.kernel_module} { // CHECK: llvm.func @__nv_cosf(!llvm.float) -> !llvm.float // CHECK: llvm.func @__nv_cos(!llvm.double) -> !llvm.double // CHECK-LABEL: func @gpu_cos @@ -156,7 +156,7 @@ gpu.module @test_module { // ----- -gpu.module @test_module { +module attributes {gpu.kernel_module} { // CHECK: llvm.func @__nv_expf(!llvm.float) -> !llvm.float // CHECK: llvm.func @__nv_exp(!llvm.double) -> !llvm.double // CHECK-LABEL: func @gpu_exp @@ -174,7 +174,7 @@ gpu.module @test_module { // ----- // Test that we handled properly operation with SymbolTable other than module op -gpu.module @test_module { +module attributes {gpu.kernel_module} { "test.symbol_scope"() ({ // CHECK: test.symbol_scope // CHECK: llvm.func @__nv_expf(!llvm.float) -> !llvm.float diff --git a/mlir/test/Conversion/GPUToNVVM/memory-attrbution.mlir b/mlir/test/Conversion/GPUToNVVM/memory-attrbution.mlir index 115c71d128004..69a16b25139e4 100644 --- a/mlir/test/Conversion/GPUToNVVM/memory-attrbution.mlir +++ b/mlir/test/Conversion/GPUToNVVM/memory-attrbution.mlir @@ -1,6 +1,6 @@ // RUN: mlir-opt --convert-gpu-to-nvvm --split-input-file %s | FileCheck %s -gpu.module @kernel { +module attributes {gpu.kernel_module} { // CHECK-LABEL: llvm.func @private gpu.func @private(%arg0: f32) private(%arg1: memref<4xf32, 5>) { // Allocate private memory inside the function. @@ -32,7 +32,7 @@ gpu.module @kernel { // ----- -gpu.module @kernel { +module attributes {gpu.kernel_module} { // Workgroup buffers are allocated as globals. // CHECK: llvm.mlir.global internal @[[buffer:.*]]() // CHECK-SAME: addr_space = 3 @@ -72,7 +72,7 @@ gpu.module @kernel { // ----- -gpu.module @kernel { +module attributes {gpu.kernel_module} { // Check that the total size was computed correctly. // CHECK: llvm.mlir.global internal @[[buffer:.*]]() // CHECK-SAME: addr_space = 3 @@ -113,7 +113,7 @@ gpu.module @kernel { // ----- -gpu.module @kernel { +module attributes {gpu.kernel_module} { // Check that several buffers are defined. // CHECK: llvm.mlir.global internal @[[buffer1:.*]]() // CHECK-SAME: !llvm<"[1 x float]"> diff --git a/mlir/test/Conversion/GPUToSPIRV/builtins.mlir b/mlir/test/Conversion/GPUToSPIRV/builtins.mlir index 7f4081e4eda0c..c0a68a9db2af4 100644 --- a/mlir/test/Conversion/GPUToSPIRV/builtins.mlir +++ b/mlir/test/Conversion/GPUToSPIRV/builtins.mlir @@ -9,7 +9,7 @@ module attributes {gpu.container_module} { // CHECK-LABEL: spv.module "Logical" "GLSL450" // CHECK: spv.globalVariable [[WORKGROUPID:@.*]] built_in("WorkgroupId") - gpu.module @kernels { + module @kernels attributes {gpu.kernel_module} { gpu.func @builtin_workgroup_id_x() attributes {gpu.kernel} { // CHECK: [[ADDRESS:%.*]] = spv._address_of [[WORKGROUPID]] @@ -32,7 +32,7 @@ module attributes {gpu.container_module} { // CHECK-LABEL: spv.module "Logical" "GLSL450" // CHECK: spv.globalVariable [[WORKGROUPID:@.*]] built_in("WorkgroupId") - gpu.module @kernels { + module @kernels attributes {gpu.kernel_module} { gpu.func @builtin_workgroup_id_y() attributes {gpu.kernel} { // CHECK: [[ADDRESS:%.*]] = spv._address_of [[WORKGROUPID]] @@ -55,7 +55,7 @@ module attributes {gpu.container_module} { // CHECK-LABEL: spv.module "Logical" "GLSL450" // CHECK: spv.globalVariable [[WORKGROUPID:@.*]] built_in("WorkgroupId") - gpu.module @kernels { + module @kernels attributes {gpu.kernel_module} { gpu.func @builtin_workgroup_id_z() attributes {gpu.kernel} { // CHECK: [[ADDRESS:%.*]] = spv._address_of [[WORKGROUPID]] @@ -78,7 +78,7 @@ module attributes {gpu.container_module} { // CHECK-LABEL: spv.module "Logical" "GLSL450" // CHECK: spv.globalVariable [[WORKGROUPSIZE:@.*]] built_in("WorkgroupSize") - gpu.module @kernels { + module @kernels attributes {gpu.kernel_module} { gpu.func @builtin_workgroup_size_x() attributes {gpu.kernel} { // CHECK: [[ADDRESS:%.*]] = spv._address_of [[WORKGROUPSIZE]] @@ -101,7 +101,7 @@ module attributes {gpu.container_module} { // CHECK-LABEL: spv.module "Logical" "GLSL450" // CHECK: spv.globalVariable [[LOCALINVOCATIONID:@.*]] built_in("LocalInvocationId") - gpu.module @kernels { + module @kernels attributes {gpu.kernel_module} { gpu.func @builtin_local_id_x() attributes {gpu.kernel} { // CHECK: [[ADDRESS:%.*]] = spv._address_of [[LOCALINVOCATIONID]] @@ -124,7 +124,7 @@ module attributes {gpu.container_module} { // CHECK-LABEL: spv.module "Logical" "GLSL450" // CHECK: spv.globalVariable [[NUMWORKGROUPS:@.*]] built_in("NumWorkgroups") - gpu.module @kernels { + module @kernels attributes {gpu.kernel_module} { gpu.func @builtin_num_workgroups_x() attributes {gpu.kernel} { // CHECK: [[ADDRESS:%.*]] = spv._address_of [[NUMWORKGROUPS]] diff --git a/mlir/test/Conversion/GPUToSPIRV/load-store.mlir b/mlir/test/Conversion/GPUToSPIRV/load-store.mlir index 446c0d602ed3c..d104c96cfa9d9 100644 --- a/mlir/test/Conversion/GPUToSPIRV/load-store.mlir +++ b/mlir/test/Conversion/GPUToSPIRV/load-store.mlir @@ -16,7 +16,7 @@ module attributes {gpu.container_module} { } // CHECK-LABEL: spv.module "Logical" "GLSL450" - gpu.module @kernels { + module @kernels attributes {gpu.kernel_module} { // CHECK-DAG: spv.globalVariable [[WORKGROUPSIZEVAR:@.*]] built_in("WorkgroupSize") : !spv.ptr, Input> // CHECK-DAG: spv.globalVariable [[NUMWORKGROUPSVAR:@.*]] built_in("NumWorkgroups") : !spv.ptr, Input> // CHECK-DAG: spv.globalVariable [[LOCALINVOCATIONIDVAR:@.*]] built_in("LocalInvocationId") : !spv.ptr, Input> diff --git a/mlir/test/Conversion/GPUToSPIRV/loop.mlir b/mlir/test/Conversion/GPUToSPIRV/loop.mlir index bd97315a2ea4c..6d38360b7e84a 100644 --- a/mlir/test/Conversion/GPUToSPIRV/loop.mlir +++ b/mlir/test/Conversion/GPUToSPIRV/loop.mlir @@ -7,7 +7,7 @@ module attributes {gpu.container_module} { return } - gpu.module @kernels { + module @kernels attributes {gpu.kernel_module} { gpu.func @loop_kernel(%arg2 : memref<10xf32>, %arg3 : memref<10xf32>) attributes {gpu.kernel} { // CHECK: [[LB:%.*]] = spv.constant 4 : i32 diff --git a/mlir/test/Conversion/GPUToSPIRV/simple.mlir b/mlir/test/Conversion/GPUToSPIRV/simple.mlir index cca5eb9d0b492..e1b687c1a0b0d 100644 --- a/mlir/test/Conversion/GPUToSPIRV/simple.mlir +++ b/mlir/test/Conversion/GPUToSPIRV/simple.mlir @@ -2,7 +2,7 @@ module attributes {gpu.container_module} { - gpu.module @kernels { + module @kernels attributes {gpu.kernel_module} { // CHECK: spv.module "Logical" "GLSL450" { // CHECK-LABEL: func @kernel_1 // CHECK-SAME: {{%.*}}: f32 {spv.interface_var_abi = {binding = 0 : i32, descriptor_set = 0 : i32, storage_class = 12 : i32{{[}][}]}} diff --git a/mlir/test/Dialect/GPU/invalid.mlir b/mlir/test/Dialect/GPU/invalid.mlir index 8f900bf6b5cef..8323fdf8709e2 100644 --- a/mlir/test/Dialect/GPU/invalid.mlir +++ b/mlir/test/Dialect/GPU/invalid.mlir @@ -167,7 +167,7 @@ module attributes {gpu.container_module} { } func @launch_func_missing_module_attribute(%sz : index) { - // expected-error@+1 {{kernel module 'kernels' is undefined}} + // expected-error@+1 {{module 'kernels' is missing the 'gpu.kernel_module' attribute}} "gpu.launch_func"(%sz, %sz, %sz, %sz, %sz, %sz) { kernel = "kernel_1", kernel_module = @kernels } : (index, index, index, index, index, index) -> () @@ -178,7 +178,8 @@ module attributes {gpu.container_module} { // ----- module attributes {gpu.container_module} { - gpu.module @kernels { } + module @kernels attributes {gpu.kernel_module} { + } func @launch_func_undefined_function(%sz : index) { // expected-error@+1 {{kernel function 'kernel_1' is undefined}} @@ -192,7 +193,7 @@ module attributes {gpu.container_module} { // ----- module attributes {gpu.container_module} { - gpu.module @kernels { + module @kernels attributes {gpu.kernel_module} { gpu.func @kernel_1(%arg1 : !llvm<"float*">) kernel { gpu.return } @@ -210,7 +211,7 @@ module attributes {gpu.container_module} { // ----- module attributes {gpu.container_module} { - gpu.module @kernels { + module @kernels attributes {gpu.kernel_module} { gpu.func @kernel_1(%arg1 : !llvm<"float*">) attributes { gpu.kernel } { gpu.return } @@ -228,7 +229,7 @@ module attributes {gpu.container_module} { // ----- -gpu.module @kernels { +module @kernels attributes {gpu.kernel_module} { gpu.func @kernel_1(%arg1 : !llvm<"float*">) attributes { gpu.kernel } { gpu.return } diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir index 033e7cbcb7e11..1dd08cea492a0 100644 --- a/mlir/test/Dialect/GPU/ops.mlir +++ b/mlir/test/Dialect/GPU/ops.mlir @@ -60,7 +60,7 @@ module attributes {gpu.container_module} { return } - gpu.module @kernels { + module @kernels attributes {gpu.kernel_module} { gpu.func @kernel_1(%arg0 : f32, %arg1 : memref) attributes {gpu.kernel} { %tIdX = "gpu.thread_id"() {dimension = "x"} : () -> (index) %tIdY = "gpu.thread_id"() {dimension = "y"} : () -> (index) diff --git a/mlir/test/Dialect/GPU/outlining.mlir b/mlir/test/Dialect/GPU/outlining.mlir index 425b4b3090c5f..5adb881a1dc6a 100644 --- a/mlir/test/Dialect/GPU/outlining.mlir +++ b/mlir/test/Dialect/GPU/outlining.mlir @@ -136,7 +136,7 @@ func @recursive_device_function() { gpu.return } -// CHECK: gpu.module @function_call_kernel { +// CHECK: module @function_call_kernel attributes {gpu.kernel_module} { // CHECK: gpu.func @function_call_kernel() // CHECK: call @device_function() : () -> () // CHECK: call @device_function() : () -> () diff --git a/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp b/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp index a05016f48e86e..d6160d6d6e0c8 100644 --- a/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp +++ b/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp @@ -105,7 +105,7 @@ static LogicalResult runMLIRPasses(ModuleOp m) { applyPassManagerCLOptions(pm); pm.addPass(createGpuKernelOutliningPass()); - auto &kernelPm = pm.nest(); + auto &kernelPm = pm.nest(); kernelPm.addPass(createLowerGpuOpsToNVVMOpsPass()); kernelPm.addPass(createConvertGPUKernelToCubinPass(&compilePtxToCubin)); pm.addPass(createLowerToLLVMPass());