diff --git a/mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h b/mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h
index ee3632ba149e5..6263ea63cbf22 100644
--- a/mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h
+++ b/mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h
@@ -64,6 +64,59 @@ struct GPUToNVVMPipelineOptions
       llvm::cl::init(true)};
 };
 
+/// Options for `gpu-lower-to-rocdl-pipeline` (GPU to ROCDL/AMDGPU lowering).
+struct GPUToROCDLPipelineOptions
+    : public PassPipelineOptions<GPUToROCDLPipelineOptions> {
+  PassOptions::Option<int64_t> indexBitWidth{
+      *this, "index-bitwidth",
+      llvm::cl::desc("Bitwidth of the index type for the host (warning this "
+                     "should be 64 until the GPU layering is fixed)"),
+      llvm::cl::init(64)};
+  PassOptions::Option<std::string> triple{
+      *this, "triple",
+      llvm::cl::desc("AMDGPU target triple (e.g. amdgcn-amd-amdhsa)."),
+      llvm::cl::init("amdgcn-amd-amdhsa")};
+  PassOptions::Option<std::string> chip{
+      *this, "chip",
+      llvm::cl::desc(
+          "AMDGPU target chip (e.g. gfx90a, gfx942, gfx1100). Required: "
+          "AMDGCN binaries are not forward-compatible across chip families.")};
+  PassOptions::Option<std::string> features{
+      *this, "features", llvm::cl::desc("AMDGPU target features."),
+      llvm::cl::init("")};
+  PassOptions::Option<std::string> binaryFormat{
+      *this, "binary-format",
+      llvm::cl::desc("Final GPU binary emission format (e.g. fatbin, binary, "
+                     "isa, llvm, offloading)."),
+      llvm::cl::init("fatbin")};
+  PassOptions::Option<std::string> abiVersion{
+      *this, "abi",
+      llvm::cl::desc("AMDHSA ABI version (e.g. \"500\", \"600\")."),
+      llvm::cl::init("600")};
+  PassOptions::Option<bool> wave64{
+      *this, "wave64",
+      llvm::cl::desc("Use Wave64 mode (default true; wave32 if false, "
+                     "appropriate for RDNA / gfx10+ where supported)."),
+      llvm::cl::init(true)};
+  PassOptions::Option<int> optLevel{
+      *this, "opt-level",
+      llvm::cl::desc("Optimization level for ROCDL/AMDGPU compilation."),
+      llvm::cl::init(2)};
+  PassOptions::Option<std::string> cmdOptions{
+      *this, "rocdl-cmd-options",
+      llvm::cl::desc(
+          "Command line options to pass to the downstream AMDGPU compiler."),
+      llvm::cl::init("")};
+  PassOptions::Option<bool> kernelUseBarePtrCallConv{
+      *this, "kernel-bare-ptr-calling-convention",
+      llvm::cl::desc("Use bareptr calling convention for device kernels."),
+      llvm::cl::init(false)};
+  PassOptions::Option<bool> hostUseBarePtrCallConv{
+      *this, "host-bare-ptr-calling-convention",
+      llvm::cl::desc("Use bareptr calling convention for the host."),
+      llvm::cl::init(false)};
+};
+
 // Options for the gpu to xevm pipeline.
 struct GPUToXeVMPipelineOptions
     : public PassPipelineOptions<GPUToXeVMPipelineOptions> {
@@ -120,6 +173,12 @@ struct GPUToXeVMPipelineOptions
 void buildLowerToNVVMPassPipeline(OpPassManager &pm,
                                   const GPUToNVVMPipelineOptions &options);
 
+/// Appends the GPU-to-ROCDL lowering pipeline to `pm`, configured by `options`.
+/// Lowers the main dialects (arith, memref, scf, vector, gpu, amdgpu) towards
+/// ROCDL/LLVM: GPU code regions are handled first, then the host code.
+void buildLowerToROCDLPassPipeline(OpPassManager &pm,
+                                   const GPUToROCDLPipelineOptions &options);
+
 /// Adds the GPU to XeVM pipeline to the given pass manager. Transforms main
 /// dialects into XeVM targets. Begins with GPU code regions, then handles host
 /// code.
@@ -128,6 +187,7 @@ void buildLowerToXeVMPassPipeline(OpPassManager &pm,
 
 /// Register all pipelines for the `gpu` dialect.
 void registerGPUToNVVMPipeline();
+void registerGPUToROCDLPipeline();
 void registerGPUToXeVMPipeline();
 
 } // namespace gpu
diff --git a/mlir/lib/Dialect/GPU/Pipelines/CMakeLists.txt b/mlir/lib/Dialect/GPU/Pipelines/CMakeLists.txt
index 85b7b1ce90637..f523ccaee3f9f 100644
--- a/mlir/lib/Dialect/GPU/Pipelines/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/Pipelines/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_mlir_dialect_library(MLIRGPUPipelines
   GPUToNVVMPipeline.cpp
+  GPUToROCDLPipeline.cpp
   GPUToXeVMPipeline.cpp
 
   ADDITIONAL_HEADER_DIRS
@@ -12,8 +13,12 @@ add_mlir_dialect_library(MLIRGPUPipelines
   MLIRTransforms
   MLIRLinalgTransforms
   MLIRAffineToStandard
+  MLIRAMDGPUToROCDL
+  MLIRArithToLLVM
+  MLIRFuncToLLVM
   MLIRGPUToLLVMSPV
   MLIRGPUToNVVMTransforms
+  MLIRGPUToROCDLTransforms
   MLIRIndexToLLVM
   MLIRMathToLLVM
   MLIRMathToXeVM
diff --git a/mlir/lib/Dialect/GPU/Pipelines/GPUToROCDLPipeline.cpp b/mlir/lib/Dialect/GPU/Pipelines/GPUToROCDLPipeline.cpp
new file mode 100644
index 0000000000000..1e5fd09a00a75
--- /dev/null
+++ b/mlir/lib/Dialect/GPU/Pipelines/GPUToROCDLPipeline.cpp
@@ -0,0 +1,136 @@
+//===- GPUToROCDLPipeline.cpp - Lowering pipeline to ROCDL/AMDGPU --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a sink pipeline that lowers a payload containing
+// `gpu.launch` / `gpu.module` ops to AMDGPU/ROCDL and emits an AMDGCN binary
+// blob via `gpu-module-to-binary`. It is the AMD counterpart of
+// `gpu-lower-to-nvvm-pipeline` and `gpu-lower-to-xevm-pipeline`.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h"
+#include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
+#include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h"
+#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVMPass.h"
+#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
+#include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h"
+#include "mlir/Conversion/GPUToROCDL/Runtimes.h"
+#include "mlir/Conversion/IndexToLLVM/IndexToLLVM.h"
+#include "mlir/Conversion/MathToLLVM/MathToLLVM.h"
+#include "mlir/Conversion/ReconcileUnrealizedCasts/ReconcileUnrealizedCasts.h"
+#include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h"
+#include "mlir/Conversion/VectorToSCF/VectorToSCF.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/GPU/Pipelines/Passes.h"
+#include "mlir/Dialect/GPU/Transforms/Passes.h"
+#include "mlir/Dialect/MemRef/Transforms/Passes.h"
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Pass/PassOptions.h"
+#include "mlir/Transforms/Passes.h"
+
+using namespace mlir;
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+// Common pipeline
+//===----------------------------------------------------------------------===//
+void buildCommonPassPipeline(
+    OpPassManager &pm, const mlir::gpu::GPUToROCDLPipelineOptions &options) {
+  // AMDGPU dialect ops (e.g. amdgpu.lds_barrier, amdgpu.dpp, amdgpu.mfma,
+  // amdgpu.dot, ...) are turned into ROCDL intrinsics up front, while they may
+  // still sit inside `gpu.launch` bodies that have not been outlined yet. The
+  // NVVM pipeline likewise runs `convert-nvgpu-to-nvvm` before outlining.
+  ConvertAMDGPUToROCDLPassOptions amdgpuOpts;
+  amdgpuOpts.chipset = options.chip;
+  pm.addPass(createConvertAMDGPUToROCDLPass(amdgpuOpts));
+
+  pm.addPass(createGpuKernelOutliningPass());
+  pm.addPass(createConvertVectorToSCFPass());
+  pm.addPass(createSCFToControlFlowPass());
+  pm.addPass(createConvertFuncToLLVMPass());
+  pm.addPass(memref::createExpandStridedMetadataPass());
+
+  GpuROCDLAttachTargetOptions attachOpts;
+  attachOpts.chip = options.chip;
+  attachOpts.triple = options.triple;
+  attachOpts.features = options.features;
+  attachOpts.wave64Flag = options.wave64;
+  attachOpts.optLevel = options.optLevel;
+  attachOpts.abiVersion = options.abiVersion;
+  pm.addPass(createGpuROCDLAttachTarget(attachOpts));
+
+  pm.addPass(createLowerAffinePass());
+  pm.addPass(createArithToLLVMConversionPass());
+  ConvertIndexToLLVMPassOptions indexOpts;
+  indexOpts.indexBitwidth = options.indexBitWidth;
+  pm.addPass(createConvertIndexToLLVMPass(indexOpts));
+  pm.addPass(createCanonicalizerPass());
+  pm.addPass(createCSEPass());
+}
+
+//===----------------------------------------------------------------------===//
+// GPUModule-specific stuff.
+//===----------------------------------------------------------------------===//
+void buildGpuPassPipeline(OpPassManager &pm,
+                          const mlir::gpu::GPUToROCDLPipelineOptions &options) {
+  ConvertGpuOpsToROCDLOpsOptions rocdlOpts;
+  rocdlOpts.indexBitwidth = options.indexBitWidth;
+  rocdlOpts.useBarePtrCallConv = options.kernelUseBarePtrCallConv;
+  rocdlOpts.chipset = options.chip;
+  // HIP is unconditionally selected as the runtime, so that gpu.printf etc.
+  // lower to the entry points exposed by `libmlir_rocm_runtime.so`.
+  rocdlOpts.runtime = mlir::gpu::amd::Runtime::HIP;
+  pm.addNestedPass<gpu::GPUModuleOp>(createConvertGpuOpsToROCDLOps(rocdlOpts));
+  pm.addNestedPass<gpu::GPUModuleOp>(createCanonicalizerPass());
+  pm.addNestedPass<gpu::GPUModuleOp>(createCSEPass());
+  pm.addNestedPass<gpu::GPUModuleOp>(createReconcileUnrealizedCastsPass());
+}
+
+//===----------------------------------------------------------------------===//
+// Host Post-GPU pipeline
+//===----------------------------------------------------------------------===//
+void buildHostPostPipeline(
+    OpPassManager &pm, const mlir::gpu::GPUToROCDLPipelineOptions &options) {
+  GpuToLLVMConversionPassOptions gpuToLLVMOpts;
+  gpuToLLVMOpts.kernelBarePtrCallConv = options.kernelUseBarePtrCallConv;
+  gpuToLLVMOpts.hostBarePtrCallConv = options.hostUseBarePtrCallConv;
+  pm.addPass(createGpuToLLVMConversionPass(gpuToLLVMOpts));
+
+  GpuModuleToBinaryPassOptions toBinaryOpts;
+  toBinaryOpts.cmdOptions = options.cmdOptions;
+  toBinaryOpts.compilationTarget = options.binaryFormat;
+  pm.addPass(createGpuModuleToBinaryPass(toBinaryOpts));
+  pm.addPass(createConvertMathToLLVMPass());
+  pm.addPass(createCanonicalizerPass());
+  pm.addPass(createCSEPass());
+  pm.addPass(createReconcileUnrealizedCastsPass());
+}
+
+} // namespace
+
+void mlir::gpu::buildLowerToROCDLPassPipeline(
+    OpPassManager &pm, const GPUToROCDLPipelineOptions &options) {
+  // Stage 1: top-level passes (AMDGPU conversion, outlining, target attach).
+  buildCommonPassPipeline(pm, options);
+
+  // Stage 2: passes nested on each `gpu.module` (GPU ops to ROCDL).
+  buildGpuPassPipeline(pm, options);
+
+  // Stage 3: host-side GPU-to-LLVM lowering and `gpu-module-to-binary`.
+  buildHostPostPipeline(pm, options);
+}
+
+void mlir::gpu::registerGPUToROCDLPipeline() {
+  PassPipelineRegistration<GPUToROCDLPipelineOptions>(
+      "gpu-lower-to-rocdl-pipeline",
+      "The default pipeline lowers main dialects (arith, memref, scf, vector, "
+      "gpu, amdgpu) to ROCDL. It starts by lowering GPU code to the specified "
+      "compilation target (default is fatbin) then lowers the host code.",
+      buildLowerToROCDLPassPipeline);
+}
diff --git a/mlir/lib/RegisterAllPasses.cpp b/mlir/lib/RegisterAllPasses.cpp
index e1d5b1236c8a6..c645d737cb766 100644
--- a/mlir/lib/RegisterAllPasses.cpp
+++ b/mlir/lib/RegisterAllPasses.cpp
@@ -100,5 +100,6 @@ void mlir::registerAllPasses() {
   sparse_tensor::registerSparseTensorPipelines();
   tosa::registerTosaToLinalgPipelines();
   gpu::registerGPUToNVVMPipeline();
+  gpu::registerGPUToROCDLPipeline();
   gpu::registerGPUToXeVMPipeline();
 }
diff --git a/mlir/test/Integration/GPU/ROCM/gpu-lower-to-rocdl-pipeline.mlir b/mlir/test/Integration/GPU/ROCM/gpu-lower-to-rocdl-pipeline.mlir
new file mode 100644
index 0000000000000..e4d2ad48644ae
--- /dev/null
+++ b/mlir/test/Integration/GPU/ROCM/gpu-lower-to-rocdl-pipeline.mlir
@@ -0,0 +1,69 @@
+// RUN: mlir-opt %s \
+// RUN:   --gpu-lower-to-rocdl-pipeline="chip=%chip" \
+// RUN: | mlir-runner \
+// RUN:   --shared-libs=%mlir_rocm_runtime \
+// RUN:   --shared-libs=%mlir_runner_utils \
+// RUN:   --entry-point-result=void \
+// RUN: | FileCheck %s
+
+// Mirror image of `vecadd.mlir`, but lowered through the
+// `gpu-lower-to-rocdl-pipeline` meta-pass instead of the hand-rolled pass
+// pipeline. Verifies that a single `--gpu-lower-to-rocdl-pipeline` invocation
+// reproduces the same numeric output the multi-step recipe used to require.
+//
+// The kernel intentionally embeds an `amdgpu.sched_barrier` op (semantically a
+// no-op, lowers to `rocdl.sched.barrier`) so that the test also exercises the
+// `convert-amdgpu-to-rocdl` step that runs first inside the meta-pass.
+
+func.func @vecadd(%arg0 : memref<5xf32>, %arg1 : memref<5xf32>, %arg2 : memref<5xf32>) {
+  %c0 = arith.constant 0 : index  // NOTE(review): not referenced in this function
+  %c1 = arith.constant 1 : index
+  %block_dim = arith.constant 5 : index  // one thread per vector element
+  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, %grid_z = %c1)
+             threads(%tx, %ty, %tz) in (%block_x = %block_dim, %block_y = %c1, %block_z = %c1) {
+    %a = memref.load %arg0[%tx] : memref<5xf32>  // thread %tx handles element %tx
+    %b = memref.load %arg1[%tx] : memref<5xf32>
+    amdgpu.sched_barrier allow = <none>  // present only to exercise convert-amdgpu-to-rocdl
+    %c = arith.addf %a, %b : f32
+    memref.store %c, %arg2[%tx] : memref<5xf32>
+    gpu.terminator
+  }
+  return
+}
+
+// CHECK: [2.46, 2.46, 2.46, 2.46, 2.46]
+func.func @main() {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c5 = arith.constant 5 : index
+  %cf1dot23 = arith.constant 1.23 : f32  // fill value for both inputs; 1.23 + 1.23 = 2.46
+  %0 = memref.alloc() : memref<5xf32>  // first input
+  %1 = memref.alloc() : memref<5xf32>  // second input
+  %2 = memref.alloc() : memref<5xf32>  // output, written only by the kernel
+  %3 = memref.cast %0 : memref<5xf32> to memref<?xf32>
+  %4 = memref.cast %1 : memref<5xf32> to memref<?xf32>
+  %5 = memref.cast %2 : memref<5xf32> to memref<?xf32>
+  scf.for %i = %c0 to %c5 step %c1 {  // initialize the two input buffers
+    memref.store %cf1dot23, %3[%i] : memref<?xf32>
+    memref.store %cf1dot23, %4[%i] : memref<?xf32>
+  }
+  %6 = memref.cast %3 : memref<?xf32> to memref<*xf32>
+  %7 = memref.cast %4 : memref<?xf32> to memref<*xf32>
+  %8 = memref.cast %5 : memref<?xf32> to memref<*xf32>
+  gpu.host_register %6 : memref<*xf32>  // register all three host buffers with the GPU runtime
+  gpu.host_register %7 : memref<*xf32>
+  gpu.host_register %8 : memref<*xf32>
+  %9 = call @mgpuMemGetDeviceMemRef1dFloat(%3) : (memref<?xf32>) -> (memref<?xf32>)  // presumably the device-visible view of the buffer; confirm against the runtime
+  %10 = call @mgpuMemGetDeviceMemRef1dFloat(%4) : (memref<?xf32>) -> (memref<?xf32>)
+  %11 = call @mgpuMemGetDeviceMemRef1dFloat(%5) : (memref<?xf32>) -> (memref<?xf32>)
+  %12 = memref.cast %9 : memref<?xf32> to memref<5xf32>
+  %13 = memref.cast %10 : memref<?xf32> to memref<5xf32>
+  %14 = memref.cast %11 : memref<?xf32> to memref<5xf32>
+
+  call @vecadd(%12, %13, %14) : (memref<5xf32>, memref<5xf32>, memref<5xf32>) -> ()
+  call @printMemrefF32(%8) : (memref<*xf32>) -> ()  // prints the output buffer through its host-side handle
+  return
+}
+
+func.func private @mgpuMemGetDeviceMemRef1dFloat(%ptr : memref<?xf32>) -> (memref<?xf32>)
+func.func private @printMemrefF32(%ptr : memref<*xf32>)