diff --git a/mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h b/mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h index ee3632ba149e5..6263ea63cbf22 100644 --- a/mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h +++ b/mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h @@ -64,6 +64,59 @@ struct GPUToNVVMPipelineOptions llvm::cl::init(true)}; }; +/// Options for the gpu to rocdl pipeline. +struct GPUToROCDLPipelineOptions + : public PassPipelineOptions { + PassOptions::Option indexBitWidth{ + *this, "index-bitwidth", + llvm::cl::desc("Bitwidth of the index type for the host (warning this " + "should be 64 until the GPU layering is fixed)"), + llvm::cl::init(64)}; + PassOptions::Option triple{ + *this, "triple", + llvm::cl::desc("AMDGPU target triple (e.g. amdgcn-amd-amdhsa)."), + llvm::cl::init("amdgcn-amd-amdhsa")}; + PassOptions::Option chip{ + *this, "chip", + llvm::cl::desc( + "AMDGPU target chip (e.g. gfx90a, gfx942, gfx1100). Required: " + "AMDGCN binaries are not forward-compatible across chip families.")}; + PassOptions::Option features{ + *this, "features", llvm::cl::desc("AMDGPU target features."), + llvm::cl::init("")}; + PassOptions::Option binaryFormat{ + *this, "binary-format", + llvm::cl::desc("Final GPU binary emission format (e.g. fatbin, binary, " + "isa, llvm, offloading)."), + llvm::cl::init("fatbin")}; + PassOptions::Option abiVersion{ + *this, "abi", + llvm::cl::desc("AMDHSA ABI version (e.g. 
\"500\", \"600\")."), + llvm::cl::init("600")}; + PassOptions::Option wave64{ + *this, "wave64", + llvm::cl::desc("Use Wave64 mode (default true; wave32 if false, " + "appropriate for RDNA / gfx10+ where supported)."), + llvm::cl::init(true)}; + PassOptions::Option optLevel{ + *this, "opt-level", + llvm::cl::desc("Optimization level for ROCDL/AMDGPU compilation."), + llvm::cl::init(2)}; + PassOptions::Option cmdOptions{ + *this, "rocdl-cmd-options", + llvm::cl::desc( + "Command line options to pass to the downstream AMDGPU compiler."), + llvm::cl::init("")}; + PassOptions::Option kernelUseBarePtrCallConv{ + *this, "kernel-bare-ptr-calling-convention", + llvm::cl::desc("Use bareptr calling convention for device kernels."), + llvm::cl::init(false)}; + PassOptions::Option hostUseBarePtrCallConv{ + *this, "host-bare-ptr-calling-convention", + llvm::cl::desc("Use bareptr calling convention for the host."), + llvm::cl::init(false)}; +}; + // Options for the gpu to xevm pipeline. struct GPUToXeVMPipelineOptions : public PassPipelineOptions { @@ -120,6 +173,12 @@ struct GPUToXeVMPipelineOptions void buildLowerToNVVMPassPipeline(OpPassManager &pm, const GPUToNVVMPipelineOptions &options); +/// Adds the GPU to ROCDL pipeline to the given pass manager. Transforms main +/// dialects (arith, memref, scf, vector, gpu) into ROCDL/AMDGPU. Begins with +/// GPU code regions, then handles host code. +void buildLowerToROCDLPassPipeline(OpPassManager &pm, + const GPUToROCDLPipelineOptions &options); + /// Adds the GPU to XeVM pipeline to the given pass manager. Transforms main /// dialects into XeVM targets. Begins with GPU code regions, then handles host /// code. @@ -128,6 +187,7 @@ void buildLowerToXeVMPassPipeline(OpPassManager &pm, /// Register all pipelines for the `gpu` dialect. 
void registerGPUToNVVMPipeline(); +void registerGPUToROCDLPipeline(); void registerGPUToXeVMPipeline(); } // namespace gpu diff --git a/mlir/lib/Dialect/GPU/Pipelines/CMakeLists.txt b/mlir/lib/Dialect/GPU/Pipelines/CMakeLists.txt index 85b7b1ce90637..f523ccaee3f9f 100644 --- a/mlir/lib/Dialect/GPU/Pipelines/CMakeLists.txt +++ b/mlir/lib/Dialect/GPU/Pipelines/CMakeLists.txt @@ -1,5 +1,6 @@ add_mlir_dialect_library(MLIRGPUPipelines GPUToNVVMPipeline.cpp + GPUToROCDLPipeline.cpp GPUToXeVMPipeline.cpp ADDITIONAL_HEADER_DIRS @@ -12,8 +13,12 @@ add_mlir_dialect_library(MLIRGPUPipelines MLIRTransforms MLIRLinalgTransforms MLIRAffineToStandard + MLIRAMDGPUToROCDL + MLIRArithToLLVM + MLIRFuncToLLVM MLIRGPUToLLVMSPV MLIRGPUToNVVMTransforms + MLIRGPUToROCDLTransforms MLIRIndexToLLVM MLIRMathToLLVM MLIRMathToXeVM diff --git a/mlir/lib/Dialect/GPU/Pipelines/GPUToROCDLPipeline.cpp b/mlir/lib/Dialect/GPU/Pipelines/GPUToROCDLPipeline.cpp new file mode 100644 index 0000000000000..1e5fd09a00a75 --- /dev/null +++ b/mlir/lib/Dialect/GPU/Pipelines/GPUToROCDLPipeline.cpp @@ -0,0 +1,136 @@ +//===- GPUToROCDLPipeline.cpp - Lowering pipeline to ROCDL/AMDGPU --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a sink pipeline that lowers a payload containing +// `gpu.launch` / `gpu.module` ops to AMDGPU/ROCDL and emits an AMDGCN binary +// blob via `gpu-module-to-binary`. It is the AMD counterpart of +// `gpu-lower-to-nvvm-pipeline` and `gpu-lower-to-xevm-pipeline`. 
+// +//===----------------------------------------------------------------------===// + +#include "mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h" +#include "mlir/Conversion/AffineToStandard/AffineToStandard.h" +#include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h" +#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVMPass.h" +#include "mlir/Conversion/GPUCommon/GPUCommonPass.h" +#include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h" +#include "mlir/Conversion/GPUToROCDL/Runtimes.h" +#include "mlir/Conversion/IndexToLLVM/IndexToLLVM.h" +#include "mlir/Conversion/MathToLLVM/MathToLLVM.h" +#include "mlir/Conversion/ReconcileUnrealizedCasts/ReconcileUnrealizedCasts.h" +#include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h" +#include "mlir/Conversion/VectorToSCF/VectorToSCF.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/GPU/Pipelines/Passes.h" +#include "mlir/Dialect/GPU/Transforms/Passes.h" +#include "mlir/Dialect/MemRef/Transforms/Passes.h" +#include "mlir/Pass/PassManager.h" +#include "mlir/Pass/PassOptions.h" +#include "mlir/Transforms/Passes.h" + +using namespace mlir; + +namespace { + +//===----------------------------------------------------------------------===// +// Common pipeline +//===----------------------------------------------------------------------===// +void buildCommonPassPipeline( + OpPassManager &pm, const mlir::gpu::GPUToROCDLPipelineOptions &options) { + // Lower AMDGPU dialect ops (e.g. amdgpu.lds_barrier, amdgpu.dpp, + // amdgpu.mfma, amdgpu.dot, ...) to ROCDL intrinsics first, while they may + // still live in un-outlined `gpu.launch` bodies. Mirrors the way NVVM's + // pipeline runs `convert-nvgpu-to-nvvm` before kernel outlining. 
+ ConvertAMDGPUToROCDLPassOptions amdgpuToROCDLOpt; + amdgpuToROCDLOpt.chipset = options.chip; + pm.addPass(createConvertAMDGPUToROCDLPass(amdgpuToROCDLOpt)); + + pm.addPass(createGpuKernelOutliningPass()); + pm.addPass(createConvertVectorToSCFPass()); + pm.addPass(createSCFToControlFlowPass()); + pm.addPass(createConvertFuncToLLVMPass()); + pm.addPass(memref::createExpandStridedMetadataPass()); + + GpuROCDLAttachTargetOptions rocdlTargetOptions; + rocdlTargetOptions.triple = options.triple; + rocdlTargetOptions.chip = options.chip; + rocdlTargetOptions.features = options.features; + rocdlTargetOptions.abiVersion = options.abiVersion; + rocdlTargetOptions.optLevel = options.optLevel; + rocdlTargetOptions.wave64Flag = options.wave64; + pm.addPass(createGpuROCDLAttachTarget(rocdlTargetOptions)); + + pm.addPass(createLowerAffinePass()); + pm.addPass(createArithToLLVMConversionPass()); + ConvertIndexToLLVMPassOptions convertIndexToLLVMPassOpt; + convertIndexToLLVMPassOpt.indexBitwidth = options.indexBitWidth; + pm.addPass(createConvertIndexToLLVMPass(convertIndexToLLVMPassOpt)); + pm.addPass(createCanonicalizerPass()); + pm.addPass(createCSEPass()); +} + +//===----------------------------------------------------------------------===// +// GPUModule-specific stuff. +//===----------------------------------------------------------------------===// +void buildGpuPassPipeline(OpPassManager &pm, + const mlir::gpu::GPUToROCDLPipelineOptions &options) { + ConvertGpuOpsToROCDLOpsOptions opt; + opt.chipset = options.chip; + opt.useBarePtrCallConv = options.kernelUseBarePtrCallConv; + opt.indexBitwidth = options.indexBitWidth; + // Always declare HIP as the runtime so that gpu.printf etc. lower to the + // matching runtime entry points exposed by `libmlir_rocm_runtime.so`. 
+ opt.runtime = mlir::gpu::amd::Runtime::HIP; + pm.addNestedPass(createConvertGpuOpsToROCDLOps(opt)); + pm.addNestedPass(createCanonicalizerPass()); + pm.addNestedPass(createCSEPass()); + pm.addNestedPass(createReconcileUnrealizedCastsPass()); +} + +//===----------------------------------------------------------------------===// +// Host Post-GPU pipeline +//===----------------------------------------------------------------------===// +void buildHostPostPipeline( + OpPassManager &pm, const mlir::gpu::GPUToROCDLPipelineOptions &options) { + GpuToLLVMConversionPassOptions opt; + opt.hostBarePtrCallConv = options.hostUseBarePtrCallConv; + opt.kernelBarePtrCallConv = options.kernelUseBarePtrCallConv; + pm.addPass(createGpuToLLVMConversionPass(opt)); + + GpuModuleToBinaryPassOptions gpuModuleToBinaryPassOptions; + gpuModuleToBinaryPassOptions.compilationTarget = options.binaryFormat; + gpuModuleToBinaryPassOptions.cmdOptions = options.cmdOptions; + pm.addPass(createGpuModuleToBinaryPass(gpuModuleToBinaryPassOptions)); + pm.addPass(createConvertMathToLLVMPass()); + pm.addPass(createCanonicalizerPass()); + pm.addPass(createCSEPass()); + pm.addPass(createReconcileUnrealizedCastsPass()); +} + +} // namespace + +void mlir::gpu::buildLowerToROCDLPassPipeline( + OpPassManager &pm, const GPUToROCDLPipelineOptions &options) { + // Common pipelines + buildCommonPassPipeline(pm, options); + + // GPUModule-specific stuff + buildGpuPassPipeline(pm, options); + + // Host post-GPUModule-specific stuff + buildHostPostPipeline(pm, options); +} + +void mlir::gpu::registerGPUToROCDLPipeline() { + PassPipelineRegistration( + "gpu-lower-to-rocdl-pipeline", + "The default pipeline lowers main dialects (arith, memref, scf, vector, " + "gpu) to ROCDL. 
It starts by lowering GPU code to the specified " + "compilation target (default is fatbin) then lowers the host code.", + buildLowerToROCDLPassPipeline); +} diff --git a/mlir/lib/RegisterAllPasses.cpp b/mlir/lib/RegisterAllPasses.cpp index e1d5b1236c8a6..c645d737cb766 100644 --- a/mlir/lib/RegisterAllPasses.cpp +++ b/mlir/lib/RegisterAllPasses.cpp @@ -100,5 +100,6 @@ void mlir::registerAllPasses() { sparse_tensor::registerSparseTensorPipelines(); tosa::registerTosaToLinalgPipelines(); gpu::registerGPUToNVVMPipeline(); + gpu::registerGPUToROCDLPipeline(); gpu::registerGPUToXeVMPipeline(); } diff --git a/mlir/test/Integration/GPU/ROCM/gpu-lower-to-rocdl-pipeline.mlir b/mlir/test/Integration/GPU/ROCM/gpu-lower-to-rocdl-pipeline.mlir new file mode 100644 index 0000000000000..e4d2ad48644ae --- /dev/null +++ b/mlir/test/Integration/GPU/ROCM/gpu-lower-to-rocdl-pipeline.mlir @@ -0,0 +1,69 @@ +// RUN: mlir-opt %s \ +// RUN: --gpu-lower-to-rocdl-pipeline="chip=%chip" \ +// RUN: | mlir-runner \ +// RUN: --shared-libs=%mlir_rocm_runtime \ +// RUN: --shared-libs=%mlir_runner_utils \ +// RUN: --entry-point-result=void \ +// RUN: | FileCheck %s + +// Mirror image of `vecadd.mlir`, but lowered through the +// `gpu-lower-to-rocdl-pipeline` meta-pass instead of the hand-rolled pass +// pipeline. Verifies that a single `--gpu-lower-to-rocdl-pipeline` invocation +// reproduces the same numeric output the multi-step recipe used to require. +// +// The kernel intentionally embeds an `amdgpu.sched_barrier` op (semantically a +// no-op, lowers to `rocdl.sched.barrier`) so that the test also exercises the +// `convert-amdgpu-to-rocdl` step that runs first inside the meta-pass. 
+ +func.func @vecadd(%arg0 : memref<5xf32>, %arg1 : memref<5xf32>, %arg2 : memref<5xf32>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %block_dim = arith.constant 5 : index + gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, %grid_z = %c1) + threads(%tx, %ty, %tz) in (%block_x = %block_dim, %block_y = %c1, %block_z = %c1) { + %a = memref.load %arg0[%tx] : memref<5xf32> + %b = memref.load %arg1[%tx] : memref<5xf32> + amdgpu.sched_barrier allow = + %c = arith.addf %a, %b : f32 + memref.store %c, %arg2[%tx] : memref<5xf32> + gpu.terminator + } + return +} + +// CHECK: [2.46, 2.46, 2.46, 2.46, 2.46] +func.func @main() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c5 = arith.constant 5 : index + %cf1dot23 = arith.constant 1.23 : f32 + %0 = memref.alloc() : memref<5xf32> + %1 = memref.alloc() : memref<5xf32> + %2 = memref.alloc() : memref<5xf32> + %3 = memref.cast %0 : memref<5xf32> to memref + %4 = memref.cast %1 : memref<5xf32> to memref + %5 = memref.cast %2 : memref<5xf32> to memref + scf.for %i = %c0 to %c5 step %c1 { + memref.store %cf1dot23, %3[%i] : memref + memref.store %cf1dot23, %4[%i] : memref + } + %6 = memref.cast %3 : memref to memref<*xf32> + %7 = memref.cast %4 : memref to memref<*xf32> + %8 = memref.cast %5 : memref to memref<*xf32> + gpu.host_register %6 : memref<*xf32> + gpu.host_register %7 : memref<*xf32> + gpu.host_register %8 : memref<*xf32> + %9 = call @mgpuMemGetDeviceMemRef1dFloat(%3) : (memref) -> (memref) + %10 = call @mgpuMemGetDeviceMemRef1dFloat(%4) : (memref) -> (memref) + %11 = call @mgpuMemGetDeviceMemRef1dFloat(%5) : (memref) -> (memref) + %12 = memref.cast %9 : memref to memref<5xf32> + %13 = memref.cast %10 : memref to memref<5xf32> + %14 = memref.cast %11 : memref to memref<5xf32> + + call @vecadd(%12, %13, %14) : (memref<5xf32>, memref<5xf32>, memref<5xf32>) -> () + call @printMemrefF32(%8) : (memref<*xf32>) -> () + return +} + +func.func private 
@mgpuMemGetDeviceMemRef1dFloat(%ptr : memref) -> (memref) +func.func private @printMemrefF32(%ptr : memref<*xf32>)