diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
index 6a005e67ca95b..eeb8fbbb180ba 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
@@ -85,8 +85,26 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
   // Add a dialect specific kernel attribute in addition to GPU kernel
   // attribute. The former is necessary for further translation while the
   // latter is expected by gpu.launch_func.
-  if (gpuFuncOp.isKernel())
+  if (gpuFuncOp.isKernel()) {
     attributes.emplace_back(kernelAttributeName, rewriter.getUnitAttr());
+
+    // Set the block size attribute if it is present.
+    if (kernelBlockSizeAttributeName.has_value()) {
+      std::optional<int32_t> dimX =
+          gpuFuncOp.getKnownBlockSize(gpu::Dimension::x);
+      std::optional<int32_t> dimY =
+          gpuFuncOp.getKnownBlockSize(gpu::Dimension::y);
+      std::optional<int32_t> dimZ =
+          gpuFuncOp.getKnownBlockSize(gpu::Dimension::z);
+      if (dimX.has_value() || dimY.has_value() || dimZ.has_value()) {
+        // If any of the dimensions are missing, fill them in with 1.
+        attributes.emplace_back(
+            kernelBlockSizeAttributeName.value(),
+            rewriter.getI32ArrayAttr(
+                {dimX.value_or(1), dimY.value_or(1), dimZ.value_or(1)}));
+      }
+    }
+  }
   auto llvmFuncOp = rewriter.create<LLVM::LLVMFuncOp>(
       gpuFuncOp.getLoc(), gpuFuncOp.getName(), funcType,
       LLVM::Linkage::External, /*dsoLocal=*/false, /*cconv=*/LLVM::CConv::C,
diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
index a77db4a036bad..471a688e85463 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
+++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
@@ -36,13 +36,15 @@ struct GPUDynamicSharedMemoryOpLowering
 };
 
 struct GPUFuncOpLowering : ConvertOpToLLVMPattern<gpu::GPUFuncOp> {
-  GPUFuncOpLowering(const LLVMTypeConverter &converter,
-                    unsigned allocaAddrSpace, unsigned workgroupAddrSpace,
-                    StringAttr kernelAttributeName)
+  GPUFuncOpLowering(
+      const LLVMTypeConverter &converter, unsigned allocaAddrSpace,
+      unsigned workgroupAddrSpace, StringAttr kernelAttributeName,
+      std::optional<StringAttr> kernelBlockSizeAttributeName = std::nullopt)
       : ConvertOpToLLVMPattern(converter), allocaAddrSpace(allocaAddrSpace),
         workgroupAddrSpace(workgroupAddrSpace),
-        kernelAttributeName(kernelAttributeName) {}
+        kernelAttributeName(kernelAttributeName),
+        kernelBlockSizeAttributeName(kernelBlockSizeAttributeName) {}
 
   LogicalResult
   matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
@@ -56,6 +58,9 @@ struct GPUFuncOpLowering : ConvertOpToLLVMPattern<gpu::GPUFuncOp> {
 
   /// The attribute name to use instead of `gpu.kernel`.
   StringAttr kernelAttributeName;
+
+  /// The attribute name to set the block size.
+  std::optional<StringAttr> kernelBlockSizeAttributeName;
 };
 
 /// The lowering of gpu.printf to a call to HIP hostcalls
diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
index e60fe5cbd7603..a7ac2332961ae 100644
--- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
+++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -352,7 +352,9 @@ void mlir::populateGpuToNVVMConversionPatterns(LLVMTypeConverter &converter,
       /*workgroupAddrSpace=*/
       static_cast<unsigned>(NVVM::NVVMMemorySpace::kSharedMemorySpace),
       StringAttr::get(&converter.getContext(),
-                      NVVM::NVVMDialect::getKernelFuncAttrName()));
+                      NVVM::NVVMDialect::getKernelFuncAttrName()),
+      StringAttr::get(&converter.getContext(),
+                      NVVM::NVVMDialect::getMaxntidAttrName()));
 
   populateOpPatterns<math::AbsFOp>(converter, patterns, "__nv_fabsf",
                                    "__nv_fabs");
diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
index 20a200e812c12..c7f1d4f124c18 100644
--- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
@@ -627,6 +627,15 @@ gpu.module @test_module_31 {
   }
 }
 
+gpu.module @gpumodule {
+// CHECK-LABEL: func @kernel_with_block_size()
+// CHECK: attributes {gpu.kernel, gpu.known_block_size = array<i32: 128, 1, 1>, nvvm.kernel, nvvm.maxntid = [128 : i32, 1 : i32, 1 : i32]}
+  gpu.func @kernel_with_block_size() kernel attributes {gpu.known_block_size = array<i32: 128, 1, 1>} {
+    gpu.return
+  }
+}
+
+
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%toplevel_module: !transform.any_op {transform.readonly}) {
     %gpu_module = transform.structured.match ops{["gpu.module"]} in %toplevel_module