From e7df8d7912db559f733601e5d6a1d4728d2357f0 Mon Sep 17 00:00:00 2001
From: Matthias Springer
Date: Tue, 18 Nov 2025 03:12:44 +0000
Subject: [PATCH] [mlir][NVVM] Add no-rollback option to NVVM lowering passes

---
 mlir/include/mlir/Conversion/Passes.td                  | 2 ++
 mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h        | 4 ++++
 mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp  | 5 ++++-
 mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp    | 1 +
 mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir         | 1 +
 mlir/test/Conversion/GPUToNVVM/memref.mlir              | 1 +
 mlir/test/Conversion/GPUToNVVM/wmma-ops-to-nvvm.mlir    | 1 +
 mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir      | 2 +-
 mlir/test/Integration/GPU/CUDA/all-reduce-maxsi.mlir    | 2 +-
 mlir/test/Integration/GPU/CUDA/all-reduce-minsi.mlir    | 2 +-
 mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir       | 2 +-
 mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir       | 2 +-
 mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir   | 2 +-
 mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir      | 2 +-
 mlir/test/Integration/GPU/CUDA/alloc-host-shared.mlir   | 2 +-
 mlir/test/Integration/GPU/CUDA/assert.mlir              | 2 +-
 mlir/test/Integration/GPU/CUDA/command-line-arg.mlir    | 2 +-
 mlir/test/Integration/GPU/CUDA/concurrent-kernels.mlir  | 2 +-
 mlir/test/Integration/GPU/CUDA/dump-ptx.mlir            | 2 +-
 mlir/test/Integration/GPU/CUDA/dump-sass.mlir           | 2 +-
 mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir        | 2 +-
 mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir | 2 +-
 mlir/test/Integration/GPU/CUDA/printf.mlir              | 2 +-
 mlir/test/Integration/GPU/CUDA/shuffle.mlir             | 2 +-
 mlir/test/Integration/GPU/CUDA/two-modules.mlir         | 2 +-
 25 files changed, 32 insertions(+), 19 deletions(-)

diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
index 79bc380dbcb7a..0164a2fb9fa81 100644
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -628,6 +628,8 @@ def ConvertGpuOpsToNVVMOps : Pass<"convert-gpu-to-nvvm", "gpu::GPUModuleOp"> {
            /*default=*/"false",
            "Replace memref arguments in GPU functions with bare pointers. "
            "All memrefs must have static shape.">,
+    Option<"allowPatternRollback", "allow-pattern-rollback", "bool", "true",
+           "Experimental performance flag to disallow pattern rollback">,
     ListOption<"allowedDialects", "allowed-dialects", "std::string",
                "Run conversion patterns of only the specified dialects">,
   ];
diff --git a/mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h b/mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h
index fccb49d49da70..34c85de3418ec 100644
--- a/mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h
+++ b/mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h
@@ -58,6 +58,10 @@ struct GPUToNVVMPipelineOptions
           "Whether to use the bareptr calling convention on the host (warning "
           "this should be false until the GPU layering is fixed)"),
       llvm::cl::init(false)};
+  PassOptions::Option<bool> allowPatternRollback{
+      *this, "allow-pattern-rollback",
+      llvm::cl::desc("Allow pattern rollback during dialect conversion"),
+      llvm::cl::init(true)};
 };
 
 // Options for the gpu to xevm pipeline.
diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
index d64c4d64cad84..5848489274c13 100644
--- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
+++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -419,7 +419,10 @@ struct LowerGpuOpsToNVVMOpsPass final
     if (this->hasRedux)
       populateGpuSubgroupReduceOpLoweringPattern(converter, llvmPatterns);
     configureGpuToNVVMConversionLegality(target);
-    if (failed(applyPartialConversion(m, target, std::move(llvmPatterns))))
+    ConversionConfig config;
+    config.allowPatternRollback = allowPatternRollback;
+    if (failed(
+            applyPartialConversion(m, target, std::move(llvmPatterns), config)))
       signalPassFailure();
   }
 };
diff --git a/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp b/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp
index 2c3e4661d266a..5462cddd44718 100644
--- a/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp
+++ b/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp
@@ -72,6 +72,7 @@ void buildGpuPassPipeline(OpPassManager &pm,
   ConvertGpuOpsToNVVMOpsOptions opt;
   opt.useBarePtrCallConv = options.kernelUseBarePtrCallConv;
   opt.indexBitwidth = options.indexBitWidth;
+  opt.allowPatternRollback = options.allowPatternRollback;
   pm.addNestedPass<gpu::GPUModuleOp>(createConvertGpuOpsToNVVMOps(opt));
   pm.addNestedPass<gpu::GPUModuleOp>(createCanonicalizerPass());
   pm.addNestedPass<gpu::GPUModuleOp>(createCSEPass());
diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
index a4b5dde8a2187..f1cc1eb983267 100644
--- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
@@ -1,4 +1,5 @@
 // RUN: mlir-opt %s -convert-gpu-to-nvvm='has-redux=1' -split-input-file | FileCheck %s
+// RUN: mlir-opt %s -convert-gpu-to-nvvm='has-redux=1 allow-pattern-rollback=0' -split-input-file | FileCheck %s
 // RUN: mlir-opt %s -convert-gpu-to-nvvm='has-redux=1 allowed-dialects=func,arith,cf' -split-input-file | FileCheck %s
 // RUN: mlir-opt %s -convert-gpu-to-nvvm='has-redux=1 use-bare-ptr-memref-call-conv=1' -split-input-file | FileCheck %s --check-prefix=CHECK-BARE
 // RUN: mlir-opt %s -transform-interpreter | FileCheck %s
diff --git a/mlir/test/Conversion/GPUToNVVM/memref.mlir b/mlir/test/Conversion/GPUToNVVM/memref.mlir
index e164ca9103dee..a4e8ead344114 100644
--- a/mlir/test/Conversion/GPUToNVVM/memref.mlir
+++ b/mlir/test/Conversion/GPUToNVVM/memref.mlir
@@ -1,4 +1,5 @@
 // RUN: mlir-opt %s -convert-gpu-to-nvvm | FileCheck %s
+// RUN: mlir-opt %s -convert-gpu-to-nvvm="allow-pattern-rollback=0" | FileCheck %s
 // RUN: mlir-opt %s -convert-gpu-to-nvvm='use-bare-ptr-memref-call-conv=1' \
 // RUN:  | FileCheck %s --check-prefix=BARE
 
diff --git a/mlir/test/Conversion/GPUToNVVM/wmma-ops-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/wmma-ops-to-nvvm.mlir
index b479467efc208..82c02c1d6ee63 100644
--- a/mlir/test/Conversion/GPUToNVVM/wmma-ops-to-nvvm.mlir
+++ b/mlir/test/Conversion/GPUToNVVM/wmma-ops-to-nvvm.mlir
@@ -1,4 +1,5 @@
 // RUN: mlir-opt --convert-gpu-to-nvvm --split-input-file %s | FileCheck %s
+// RUN: mlir-opt --convert-gpu-to-nvvm="allow-pattern-rollback=0" --split-input-file %s | FileCheck %s
 // RUN: mlir-opt --convert-gpu-to-nvvm="index-bitwidth=32" --split-input-file %s | FileCheck --check-prefix=CHECK32 %s
 
 gpu.module @test_module {
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir
index 5585d98c25b82..d0001f6ffc376 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir
@@ -1,5 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline \
+// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="allow-pattern-rollback=0" \
 // RUN: | mlir-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-maxsi.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-maxsi.mlir
index cd90ce3ba2f1a..fcff5f40a6cc7 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-maxsi.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-maxsi.mlir
@@ -1,5 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \
+// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \
 // RUN: | mlir-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-minsi.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-minsi.mlir
index fec2567f47f15..4718ac94fa0f2 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-minsi.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-minsi.mlir
@@ -1,5 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \
+// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \
 // RUN: | mlir-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir
index d5633b00313b3..5e3a7e7e7d729 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir
@@ -1,5 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \
+// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \
 // RUN: | mlir-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir
index db297b0fc27b7..f1a48ae0c19c5 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir
@@ -1,5 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \
+// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \
 // RUN: | mlir-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir
index 65cbc79752177..f0a46cea7ceb9 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir
@@ -1,5 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \
+// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \
 // RUN: | mlir-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir
index a0c955e4b570c..ddbabd4ddf960 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir
@@ -1,5 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \
+// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \
 // RUN: | mlir-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/alloc-host-shared.mlir b/mlir/test/Integration/GPU/CUDA/alloc-host-shared.mlir
index f041df82b4325..5c56e2ddfbd51 100644
--- a/mlir/test/Integration/GPU/CUDA/alloc-host-shared.mlir
+++ b/mlir/test/Integration/GPU/CUDA/alloc-host-shared.mlir
@@ -1,5 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \
+// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \
 // RUN: | mlir-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/assert.mlir b/mlir/test/Integration/GPU/CUDA/assert.mlir
index 71a21cf4bd620..83cf70cd17078 100644
--- a/mlir/test/Integration/GPU/CUDA/assert.mlir
+++ b/mlir/test/Integration/GPU/CUDA/assert.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \
+// RUN: mlir-opt %s -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \
 // RUN: | mlir-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/command-line-arg.mlir b/mlir/test/Integration/GPU/CUDA/command-line-arg.mlir
index 34dde6e03c80e..77a4fa089b62d 100644
--- a/mlir/test/Integration/GPU/CUDA/command-line-arg.mlir
+++ b/mlir/test/Integration/GPU/CUDA/command-line-arg.mlir
@@ -1,5 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-chip=sm_80 ptxas-cmd-options='-v --register-usage-level=8'" -debug-only=serialize-to-binary \
+// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-chip=sm_80 ptxas-cmd-options='-v --register-usage-level=8' allow-pattern-rollback=0" -debug-only=serialize-to-binary \
 // RUN: 2>&1 | FileCheck %s
 
 func.func @host_function(%arg0 : f32, %arg1 : memref<?xf32>) {
diff --git a/mlir/test/Integration/GPU/CUDA/concurrent-kernels.mlir b/mlir/test/Integration/GPU/CUDA/concurrent-kernels.mlir
index ed01416d9523a..51f6e36aaa977 100644
--- a/mlir/test/Integration/GPU/CUDA/concurrent-kernels.mlir
+++ b/mlir/test/Integration/GPU/CUDA/concurrent-kernels.mlir
@@ -2,7 +2,7 @@
 // increment a global atomic counter and wait for the counter to reach 2.
 //
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \
+// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \
 // RUN: | env CUDA_MODULE_LOADING=EAGER mlir-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/dump-ptx.mlir b/mlir/test/Integration/GPU/CUDA/dump-ptx.mlir
index 27ec1ec435fef..efffcaaf23b2e 100644
--- a/mlir/test/Integration/GPU/CUDA/dump-ptx.mlir
+++ b/mlir/test/Integration/GPU/CUDA/dump-ptx.mlir
@@ -1,5 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline -debug-only=serialize-to-isa \
+// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="allow-pattern-rollback=0" -debug-only=serialize-to-isa \
 // RUN: 2>&1 | FileCheck %s
 
 // CHECK-LABEL: Generated by LLVM NVPTX Back-End
diff --git a/mlir/test/Integration/GPU/CUDA/dump-sass.mlir b/mlir/test/Integration/GPU/CUDA/dump-sass.mlir
index d32f5efc29d58..f810678569615 100644
--- a/mlir/test/Integration/GPU/CUDA/dump-sass.mlir
+++ b/mlir/test/Integration/GPU/CUDA/dump-sass.mlir
@@ -1,5 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline -debug-only=dump-sass \
+// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="allow-pattern-rollback=0" -debug-only=dump-sass \
 // RUN: 2>&1 | FileCheck %s
 
 // CHECK: MOV
diff --git a/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir b/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir
index 07f3218ae89b2..fe3c2b1d93a1b 100644
--- a/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir
+++ b/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir
@@ -1,5 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \
+// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \
 // RUN: | mlir-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir b/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir
index b2ac90acde94f..f8f1aa8aaa42e 100644
--- a/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir
+++ b/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir
@@ -1,5 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \
+// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \
 // RUN: | mlir-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/printf.mlir b/mlir/test/Integration/GPU/CUDA/printf.mlir
index fd664f2331488..ef116760b69e5 100644
--- a/mlir/test/Integration/GPU/CUDA/printf.mlir
+++ b/mlir/test/Integration/GPU/CUDA/printf.mlir
@@ -1,5 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \
+// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \
 // RUN: | mlir-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/shuffle.mlir b/mlir/test/Integration/GPU/CUDA/shuffle.mlir
index a6207d64c038b..a4be5223cd792 100644
--- a/mlir/test/Integration/GPU/CUDA/shuffle.mlir
+++ b/mlir/test/Integration/GPU/CUDA/shuffle.mlir
@@ -1,5 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \
+// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \
 // RUN: | mlir-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/two-modules.mlir b/mlir/test/Integration/GPU/CUDA/two-modules.mlir
index c3cee2fda46f3..3490003d6ba19 100644
--- a/mlir/test/Integration/GPU/CUDA/two-modules.mlir
+++ b/mlir/test/Integration/GPU/CUDA/two-modules.mlir
@@ -1,5 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \
+// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format allow-pattern-rollback=0" \
 // RUN: | mlir-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \