[MLIR][GPU] Add gpu-lower-to-rocdl-pipeline meta-pass #196751
Conversation
|
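@llvm/pr-subscribers-mlir-gpu @llvm/pr-subscribers-mlir Author: Ivan Butygin (Hardcode84) Changes: Add `gpu-lower-to-rocdl-pipeline` meta-pass which lowers common MLIR dialects (gpu/arith/scf/vector) to binary, similar to the existing XeVM/NVVM pipelines. Full diff: https://github.com/llvm/llvm-project/pull/196751.diff 5 Files Affected: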
diff --git a/mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h b/mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h
index ee3632ba149e5..9a258ca30ec9e 100644
--- a/mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h
+++ b/mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h
@@ -64,6 +64,58 @@ struct GPUToNVVMPipelineOptions
llvm::cl::init(true)};
};
+/// Options for the `gpu-lower-to-rocdl-pipeline` pass pipeline.
+struct GPUToROCDLPipelineOptions
+ : public PassPipelineOptions<GPUToROCDLPipelineOptions> {
+ PassOptions::Option<int64_t> indexBitWidth{
+ *this, "index-bitwidth",
+ llvm::cl::desc("Bitwidth of the index type for the host (warning this "
+ "should be 64 until the GPU layering is fixed)"),
+ llvm::cl::init(64)};
+ PassOptions::Option<std::string> hsacoTriple{
+ *this, "hsaco-triple",
+ llvm::cl::desc("Triple to use to serialize to hsaco."),
+ llvm::cl::init("amdgcn-amd-amdhsa")};
+ PassOptions::Option<std::string> hsacoChip{
+ *this, "hsaco-chip", llvm::cl::desc("Chip to use to serialize to hsaco."),
+ llvm::cl::init("gfx900")};
+ PassOptions::Option<std::string> hsacoFeatures{
+ *this, "hsaco-features",
+ llvm::cl::desc("Features to use to serialize to hsaco."),
+ llvm::cl::init("")};
+ PassOptions::Option<std::string> binaryFormat{
+ *this, "binary-format",
+ llvm::cl::desc("Final GPU binary emission format (e.g. fatbin, binary, "
+ "isa, llvm, offloading)."),
+ llvm::cl::init("fatbin")};
+ PassOptions::Option<std::string> hsacoAbiVersion{
+ *this, "hsaco-abi",
+ llvm::cl::desc("AMDHSA ABI version (e.g. \"500\", \"600\")."),
+ llvm::cl::init("600")};
+ PassOptions::Option<bool> hsacoWave64{
+ *this, "hsaco-wave64",
+ llvm::cl::desc("Use Wave64 mode (default true; wave32 if false, "
+ "appropriate for RDNA / gfx10+ where supported)."),
+ llvm::cl::init(true)};
+ PassOptions::Option<int> optLevel{
+ *this, "opt-level",
+ llvm::cl::desc("Optimization level for ROCDL/AMDGPU compilation."),
+ llvm::cl::init(2)};
+ PassOptions::Option<std::string> cmdOptions{
+ *this, "rocdl-cmd-options",
+ llvm::cl::desc(
+ "Command line options to pass to the downstream AMDGPU compiler."),
+ llvm::cl::init("")};
+ PassOptions::Option<bool> kernelUseBarePtrCallConv{
+ *this, "kernel-bare-ptr-calling-convention",
+ llvm::cl::desc("Use bareptr calling convention for device kernels."),
+ llvm::cl::init(false)};
+ PassOptions::Option<bool> hostUseBarePtrCallConv{
+ *this, "host-bare-ptr-calling-convention",
+ llvm::cl::desc("Use bareptr calling convention for the host."),
+ llvm::cl::init(false)};
+};
+
// Options for the gpu to xevm pipeline.
struct GPUToXeVMPipelineOptions
: public PassPipelineOptions<GPUToXeVMPipelineOptions> {
@@ -120,6 +172,12 @@ struct GPUToXeVMPipelineOptions
void buildLowerToNVVMPassPipeline(OpPassManager &pm,
const GPUToNVVMPipelineOptions &options);
+/// Adds the GPU to ROCDL pipeline to the given pass manager. Transforms main
+/// dialects (arith, memref, scf, vector, gpu) into ROCDL/AMDGPU. Begins with
+/// GPU code regions, then handles host code.
+void buildLowerToROCDLPassPipeline(OpPassManager &pm,
+ const GPUToROCDLPipelineOptions &options);
+
/// Adds the GPU to XeVM pipeline to the given pass manager. Transforms main
/// dialects into XeVM targets. Begins with GPU code regions, then handles host
/// code.
@@ -128,6 +186,7 @@ void buildLowerToXeVMPassPipeline(OpPassManager &pm,
/// Register all pipelines for the `gpu` dialect.
void registerGPUToNVVMPipeline();
+void registerGPUToROCDLPipeline();
void registerGPUToXeVMPipeline();
} // namespace gpu
diff --git a/mlir/lib/Dialect/GPU/Pipelines/CMakeLists.txt b/mlir/lib/Dialect/GPU/Pipelines/CMakeLists.txt
index 85b7b1ce90637..f523ccaee3f9f 100644
--- a/mlir/lib/Dialect/GPU/Pipelines/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/Pipelines/CMakeLists.txt
@@ -1,5 +1,6 @@
add_mlir_dialect_library(MLIRGPUPipelines
GPUToNVVMPipeline.cpp
+ GPUToROCDLPipeline.cpp
GPUToXeVMPipeline.cpp
ADDITIONAL_HEADER_DIRS
@@ -12,8 +13,12 @@ add_mlir_dialect_library(MLIRGPUPipelines
MLIRTransforms
MLIRLinalgTransforms
MLIRAffineToStandard
+ MLIRAMDGPUToROCDL
+ MLIRArithToLLVM
+ MLIRFuncToLLVM
MLIRGPUToLLVMSPV
MLIRGPUToNVVMTransforms
+ MLIRGPUToROCDLTransforms
MLIRIndexToLLVM
MLIRMathToLLVM
MLIRMathToXeVM
diff --git a/mlir/lib/Dialect/GPU/Pipelines/GPUToROCDLPipeline.cpp b/mlir/lib/Dialect/GPU/Pipelines/GPUToROCDLPipeline.cpp
new file mode 100644
index 0000000000000..398cd59ec919e
--- /dev/null
+++ b/mlir/lib/Dialect/GPU/Pipelines/GPUToROCDLPipeline.cpp
@@ -0,0 +1,136 @@
+//===- GPUToROCDLPipeline.cpp - Lowering pipeline to ROCDL/AMDGPU --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a sink pipeline that lowers a payload containing
+// `gpu.launch` / `gpu.module` ops to AMDGPU/ROCDL and emits an AMDGCN binary
+// blob via `gpu-module-to-binary`. It is the AMD counterpart of
+// `gpu-lower-to-nvvm-pipeline` and `gpu-lower-to-xevm-pipeline`.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h"
+#include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
+#include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h"
+#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVMPass.h"
+#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
+#include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h"
+#include "mlir/Conversion/GPUToROCDL/Runtimes.h"
+#include "mlir/Conversion/IndexToLLVM/IndexToLLVM.h"
+#include "mlir/Conversion/MathToLLVM/MathToLLVM.h"
+#include "mlir/Conversion/ReconcileUnrealizedCasts/ReconcileUnrealizedCasts.h"
+#include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h"
+#include "mlir/Conversion/VectorToSCF/VectorToSCF.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/GPU/Pipelines/Passes.h"
+#include "mlir/Dialect/GPU/Transforms/Passes.h"
+#include "mlir/Dialect/MemRef/Transforms/Passes.h"
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Pass/PassOptions.h"
+#include "mlir/Transforms/Passes.h"
+
+using namespace mlir;
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+// Common pipeline (passes added at top level, not nested in gpu.module)
+//===----------------------------------------------------------------------===//
+void buildCommonPassPipeline(
+ OpPassManager &pm, const mlir::gpu::GPUToROCDLPipelineOptions &options) {
+ // Lower AMDGPU dialect ops (e.g. amdgpu.lds_barrier, amdgpu.dpp,
+ // amdgpu.mfma, amdgpu.dot, ...) to ROCDL intrinsics first, while they may
+ // still live in un-outlined `gpu.launch` bodies. Mirrors the way NVVM's
+ // pipeline runs `convert-nvgpu-to-nvvm` before kernel outlining.
+ ConvertAMDGPUToROCDLPassOptions amdgpuToROCDLOpt;
+ amdgpuToROCDLOpt.chipset = options.hsacoChip;
+ pm.addPass(createConvertAMDGPUToROCDLPass(amdgpuToROCDLOpt));
+
+ pm.addPass(createGpuKernelOutliningPass());
+ pm.addPass(createConvertVectorToSCFPass());
+ pm.addPass(createSCFToControlFlowPass());
+ pm.addPass(createConvertFuncToLLVMPass());
+ pm.addPass(memref::createExpandStridedMetadataPass());
+
+ GpuROCDLAttachTargetOptions rocdlTargetOptions;
+ rocdlTargetOptions.triple = options.hsacoTriple;
+ rocdlTargetOptions.chip = options.hsacoChip;
+ rocdlTargetOptions.features = options.hsacoFeatures;
+ rocdlTargetOptions.abiVersion = options.hsacoAbiVersion;
+ rocdlTargetOptions.optLevel = options.optLevel;
+ rocdlTargetOptions.wave64Flag = options.hsacoWave64;
+ pm.addPass(createGpuROCDLAttachTarget(rocdlTargetOptions));
+
+ pm.addPass(createLowerAffinePass());
+ pm.addPass(createArithToLLVMConversionPass());
+ ConvertIndexToLLVMPassOptions convertIndexToLLVMPassOpt;
+ convertIndexToLLVMPassOpt.indexBitwidth = options.indexBitWidth;
+ pm.addPass(createConvertIndexToLLVMPass(convertIndexToLLVMPassOpt));
+ pm.addPass(createCanonicalizerPass());
+ pm.addPass(createCSEPass());
+}
+
+//===----------------------------------------------------------------------===//
+// GPUModule-specific pipeline (passes nested under each gpu.module)
+//===----------------------------------------------------------------------===//
+void buildGpuPassPipeline(OpPassManager &pm,
+ const mlir::gpu::GPUToROCDLPipelineOptions &options) {
+ ConvertGpuOpsToROCDLOpsOptions opt;
+ opt.chipset = options.hsacoChip;
+ opt.useBarePtrCallConv = options.kernelUseBarePtrCallConv;
+ opt.indexBitwidth = options.indexBitWidth;
+ // Always declare HIP as the runtime so that gpu.printf etc. lower to the
+ // matching runtime entry points exposed by `libmlir_rocm_runtime.so`.
+ opt.runtime = mlir::gpu::amd::Runtime::HIP;
+ pm.addNestedPass<gpu::GPUModuleOp>(createConvertGpuOpsToROCDLOps(opt));
+ pm.addNestedPass<gpu::GPUModuleOp>(createCanonicalizerPass());
+ pm.addNestedPass<gpu::GPUModuleOp>(createCSEPass());
+ pm.addNestedPass<gpu::GPUModuleOp>(createReconcileUnrealizedCastsPass());
+}
+
+//===----------------------------------------------------------------------===//
+// Host post-GPU pipeline (gpu-to-llvm, binary emission, final cleanup)
+//===----------------------------------------------------------------------===//
+void buildHostPostPipeline(
+ OpPassManager &pm, const mlir::gpu::GPUToROCDLPipelineOptions &options) {
+ GpuToLLVMConversionPassOptions opt;
+ opt.hostBarePtrCallConv = options.hostUseBarePtrCallConv;
+ opt.kernelBarePtrCallConv = options.kernelUseBarePtrCallConv;
+ pm.addPass(createGpuToLLVMConversionPass(opt));
+
+ GpuModuleToBinaryPassOptions gpuModuleToBinaryPassOptions;
+ gpuModuleToBinaryPassOptions.compilationTarget = options.binaryFormat;
+ gpuModuleToBinaryPassOptions.cmdOptions = options.cmdOptions;
+ pm.addPass(createGpuModuleToBinaryPass(gpuModuleToBinaryPassOptions));
+ pm.addPass(createConvertMathToLLVMPass());
+ pm.addPass(createCanonicalizerPass());
+ pm.addPass(createCSEPass());
+ pm.addPass(createReconcileUnrealizedCastsPass());
+}
+
+} // namespace
+
+void mlir::gpu::buildLowerToROCDLPassPipeline(
+ OpPassManager &pm, const GPUToROCDLPipelineOptions &options) {
+ // Phase 1: top-level passes, including GPU kernel outlining.
+ buildCommonPassPipeline(pm, options);
+
+ // Phase 2: device-side lowering nested under each gpu.module.
+ buildGpuPassPipeline(pm, options);
+
+ // Phase 3: host-side lowering and GPU binary emission.
+ buildHostPostPipeline(pm, options);
+}
+
+void mlir::gpu::registerGPUToROCDLPipeline() {
+ PassPipelineRegistration<GPUToROCDLPipelineOptions>(
+ "gpu-lower-to-rocdl-pipeline",
+ "The default pipeline lowers main dialects (arith, memref, scf, vector, "
+ "gpu) to ROCDL. It starts by lowering GPU code to the specified "
+ "compilation target (default is fatbin) then lowers the host code.",
+ buildLowerToROCDLPassPipeline);
+}
diff --git a/mlir/lib/RegisterAllPasses.cpp b/mlir/lib/RegisterAllPasses.cpp
index e1d5b1236c8a6..c645d737cb766 100644
--- a/mlir/lib/RegisterAllPasses.cpp
+++ b/mlir/lib/RegisterAllPasses.cpp
@@ -100,5 +100,6 @@ void mlir::registerAllPasses() {
sparse_tensor::registerSparseTensorPipelines();
tosa::registerTosaToLinalgPipelines();
gpu::registerGPUToNVVMPipeline();
+ gpu::registerGPUToROCDLPipeline();
gpu::registerGPUToXeVMPipeline();
}
diff --git a/mlir/test/Integration/GPU/ROCM/gpu-lower-to-rocdl-pipeline.mlir b/mlir/test/Integration/GPU/ROCM/gpu-lower-to-rocdl-pipeline.mlir
new file mode 100644
index 0000000000000..8e1cd6371a677
--- /dev/null
+++ b/mlir/test/Integration/GPU/ROCM/gpu-lower-to-rocdl-pipeline.mlir
@@ -0,0 +1,69 @@
+// RUN: mlir-opt %s \
+// RUN: --gpu-lower-to-rocdl-pipeline="hsaco-chip=%chip" \
+// RUN: | mlir-runner \
+// RUN: --shared-libs=%mlir_rocm_runtime \
+// RUN: --shared-libs=%mlir_runner_utils \
+// RUN: --entry-point-result=void \
+// RUN: | FileCheck %s
+
+// Mirror image of `vecadd.mlir`, but lowered through the
+// `gpu-lower-to-rocdl-pipeline` meta-pass instead of the hand-rolled pass
+// pipeline. Verifies that a single `--gpu-lower-to-rocdl-pipeline` invocation
+// reproduces the same numeric output the multi-step recipe used to require.
+//
+// The kernel intentionally embeds an `amdgpu.sched_barrier` op (semantically a
+// no-op, lowers to `rocdl.sched.barrier`) so that the test also exercises the
+// `convert-amdgpu-to-rocdl` step that runs first inside the meta-pass.
+
+func.func @vecadd(%arg0 : memref<5xf32>, %arg1 : memref<5xf32>, %arg2 : memref<5xf32>) {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %block_dim = arith.constant 5 : index
+ gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, %grid_z = %c1)
+ threads(%tx, %ty, %tz) in (%block_x = %block_dim, %block_y = %c1, %block_z = %c1) {
+ %a = memref.load %arg0[%tx] : memref<5xf32>
+ %b = memref.load %arg1[%tx] : memref<5xf32>
+ amdgpu.sched_barrier allow = <none>
+ %c = arith.addf %a, %b : f32
+ memref.store %c, %arg2[%tx] : memref<5xf32>
+ gpu.terminator
+ }
+ return
+}
+
+// CHECK: [2.46, 2.46, 2.46, 2.46, 2.46]
+func.func @main() {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c5 = arith.constant 5 : index
+ %cf1dot23 = arith.constant 1.23 : f32
+ %0 = memref.alloc() : memref<5xf32>
+ %1 = memref.alloc() : memref<5xf32>
+ %2 = memref.alloc() : memref<5xf32>
+ %3 = memref.cast %0 : memref<5xf32> to memref<?xf32>
+ %4 = memref.cast %1 : memref<5xf32> to memref<?xf32>
+ %5 = memref.cast %2 : memref<5xf32> to memref<?xf32>
+ scf.for %i = %c0 to %c5 step %c1 {
+ memref.store %cf1dot23, %3[%i] : memref<?xf32>
+ memref.store %cf1dot23, %4[%i] : memref<?xf32>
+ }
+ %6 = memref.cast %3 : memref<?xf32> to memref<*xf32>
+ %7 = memref.cast %4 : memref<?xf32> to memref<*xf32>
+ %8 = memref.cast %5 : memref<?xf32> to memref<*xf32>
+ gpu.host_register %6 : memref<*xf32>
+ gpu.host_register %7 : memref<*xf32>
+ gpu.host_register %8 : memref<*xf32>
+ %9 = call @mgpuMemGetDeviceMemRef1dFloat(%3) : (memref<?xf32>) -> (memref<?xf32>)
+ %10 = call @mgpuMemGetDeviceMemRef1dFloat(%4) : (memref<?xf32>) -> (memref<?xf32>)
+ %11 = call @mgpuMemGetDeviceMemRef1dFloat(%5) : (memref<?xf32>) -> (memref<?xf32>)
+ %12 = memref.cast %9 : memref<?xf32> to memref<5xf32>
+ %13 = memref.cast %10 : memref<?xf32> to memref<5xf32>
+ %14 = memref.cast %11 : memref<?xf32> to memref<5xf32>
+
+ call @vecadd(%12, %13, %14) : (memref<5xf32>, memref<5xf32>, memref<5xf32>) -> ()
+ call @printMemrefF32(%8) : (memref<*xf32>) -> ()
+ return
+}
+
+func.func private @mgpuMemGetDeviceMemRef1dFloat(%ptr : memref<?xf32>) -> (memref<?xf32>)
+func.func private @printMemrefF32(%ptr : memref<*xf32>)
|
| "should be 64 until the GPU layering is fixed)"), | ||
| llvm::cl::init(64)}; | ||
| PassOptions::Option<std::string> hsacoTriple{ | ||
| *this, "hsaco-triple", |
There was a problem hiding this comment.
Why the hsaco prefix? It looks unnecessary.
There was a problem hiding this comment.
dropped the prefix
| llvm::cl::init("amdgcn-amd-amdhsa")}; | ||
| PassOptions::Option<std::string> hsacoChip{ | ||
| *this, "hsaco-chip", llvm::cl::desc("Chip to use to serialize to hsaco."), | ||
| llvm::cl::init("gfx900")}; |
There was a problem hiding this comment.
I know the target attr uses this as the default, but I'm not sure it makes sense to have it. In the NVVM pipeline it makes sense because of PTX, but I think this is a footgun for users new to AMDGPUs, so I'd propose removing it.
There was a problem hiding this comment.
Removed the default value; it's now explicitly required here. FYI, the underlying passes still have their own default values if invoked separately.
Add an AMDGPU counterpart to gpu-lower-to-nvvm-pipeline and
gpu-lower-to-xevm-pipeline. The new pass takes a payload that contains
gpu.launch / gpu.module ops (optionally with AMDGPU dialect ops embedded in
the device code) all the way to an AMDGCN binary embedded in a gpu.binary
op, ready to be launched via libmlir_rocm_runtime.so.
Until now ROCDL users had to hand-compose ~10 passes (convert-amdgpu-to-rocdl,
gpu-kernel-outlining, convert-vector-to-scf, convert-scf-to-cf,
convert-func-to-llvm, expand-strided-metadata, rocdl-attach-target,
lower-affine, convert-arith-to-llvm, convert-index-to-llvm,
gpu.module(convert-gpu-to-rocdl, canonicalize, cse,
reconcile-unrealized-casts), gpu-to-llvm, gpu-module-to-binary,
convert-math-to-llvm, reconcile-unrealized-casts) to do what NVVM and XeVM
users get from a single pipeline option. This patch reuses the same
three-phase structure as GPUToNVVMPipeline.cpp (pre-GPU common,
gpu.module-nested, host-post) and runs convert-amdgpu-to-rocdl as the
first step (mirroring NVVM's placement of convert-nvgpu-to-nvvm), so
that AMDGPU dialect ops (amdgpu.dpp, amdgpu.mfma, amdgpu.dot,
amdgpu.sched_barrier, amdgpu.lds_barrier, amdgpu.raw_buffer_*,
amdgpu.fat_raw_buffer_cast, ...) lower correctly without any extra setup.
The pipeline exposes the AMDGPU-codegen knobs needed at the boundary
(triple, chip, features, abi, wave64, opt-level, rocdl-cmd-options,
{kernel,host}-bare-ptr-calling-convention, index-bitwidth) and threads
chip through both rocdl-attach-target and convert-amdgpu-to-rocdl as
the chipset. chip has no default: AMDGCN binaries are not forward-
compatible across chip families, so silently codegen'ing for some hard
coded chip would be a footgun; passing nothing produces a clear
"Invalid chipset name" diagnostic from convert-amdgpu-to-rocdl. The
output container is selected via binary-format (matching the XeVM
pipeline's naming, since the option is vendor-agnostic and accepts
fatbin/binary/isa/llvm/offloading rather than only hsaco).
Hooked into mlir::registerAllPasses(). Tested end-to-end on a Radeon Pro
W7900 (gfx1100): the new lit test under test/Integration/GPU/ROCM/
runs the same vecadd payload as the existing vecadd.mlir but lowers it
through a single --gpu-lower-to-rocdl-pipeline=chip=%chip invocation.
The kernel embeds an amdgpu.sched_barrier op so the test also exercises
the convert-amdgpu-to-rocdl step inside the meta-pass. All six tests
in test/Integration/GPU/ROCM/ pass.
f215e30 to
037caaa
Compare
Add `gpu-lower-to-rocdl-pipeline` meta-pass which lowers common MLIR dialects (gpu/arith/scf/vector) to binary, similar to the existing XeVM/NVVM pipelines.
Add `gpu-lower-to-rocdl-pipeline` meta-pass which lowers common MLIR dialects (gpu/arith/scf/vector) to binary, similar to the existing XeVM/NVVM pipelines.
Add `gpu-lower-to-rocdl-pipeline` meta-pass which lowers common MLIR dialects (gpu/arith/scf/vector) to binary, similar to the existing XeVM/NVVM pipelines.