From 00af43ad8856180af20c8f4867efbe0fa4db1b22 Mon Sep 17 00:00:00 2001
From: Hsiangkai Wang
Date: Mon, 15 Sep 2025 14:43:36 +0100
Subject: [PATCH] [mlir][gpu][vector] Lower Vector dialect to GPU for element-wise ops only

convertVectorToMMAOps currently starts from vector.contract and collects
its dependencies as the targets to convert. The GPU dialect provides a
gpu.subgroup_mma_elementwise operation, so element-wise operations should
be lowerable to GPU MMA operations even when no vector.contract is
present. This patch adds that case to the pattern.
---
 .../Conversion/VectorToGPU/VectorToGPU.cpp |  9 ++++++---
 .../VectorToGPU/vector-to-mma-ops.mlir     | 19 +++++++++++++++++++
 2 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp b/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp
index 1d1904f717335..e373e9fe63500 100644
--- a/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp
+++ b/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp
@@ -355,11 +355,14 @@ static SetVector<Operation *> getOpToConvert(mlir::Operation *op,
   forwardSliceOptions.filter = hasVectorSrc;
 
   SetVector<Operation *> opToConvert;
-  op->walk([&](vector::ContractionOp contract) {
-    if (opToConvert.contains(contract.getOperation()))
+  op->walk([&](Operation *nestedOp) {
+    if (!isa<vector::ContractionOp>(nestedOp) &&
+        !elementwiseSupportsMMAMatrixType(nestedOp))
+      return;
+    if (opToConvert.contains(nestedOp))
       return;
     SetVector<Operation *> dependentOps =
-        getSliceContract(contract, backwardSliceOptions, forwardSliceOptions);
+        getSliceContract(nestedOp, backwardSliceOptions, forwardSliceOptions);
     // If any instruction cannot use MMA matrix type drop the whole
     // chain. MMA matrix are stored in an opaque type so they cannot be used
     // by all operations.
diff --git a/mlir/test/Conversion/VectorToGPU/vector-to-mma-ops.mlir b/mlir/test/Conversion/VectorToGPU/vector-to-mma-ops.mlir
index b8ac63f89af33..ef72901750479 100644
--- a/mlir/test/Conversion/VectorToGPU/vector-to-mma-ops.mlir
+++ b/mlir/test/Conversion/VectorToGPU/vector-to-mma-ops.mlir
@@ -536,3 +536,22 @@ func.func @test_unsupported(%arg0: vector<4x4xi32>, %arg1: vector<4x4xi32>, %arg
     %0, %1, %arg2 : vector<4x4xi64>, vector<4x4xi64> into vector<4x4xi64>
   return %2 : vector<4x4xi64>
 }
+
+// -----
+
+#map0 = affine_map<(d0, d1) -> (d1, d0)>
+
+// CHECK-LABEL: func @addf
+// CHECK: %[[A:.+]] = gpu.subgroup_mma_load_matrix {{.+}} {leadDimension = 16 : index} : memref<16x16xf16> -> !gpu.mma_matrix<16x16xf16, "COp">
+// CHECK: %[[B:.+]] = gpu.subgroup_mma_load_matrix {{.+}} {leadDimension = 16 : index, transpose} : memref<16x16xf16> -> !gpu.mma_matrix<16x16xf16, "COp">
+// CHECK: %[[C:.+]] = gpu.subgroup_mma_elementwise addf %[[A]], %[[B]] : (!gpu.mma_matrix<16x16xf16, "COp">, !gpu.mma_matrix<16x16xf16, "COp">) -> !gpu.mma_matrix<16x16xf16, "COp">
+// CHECK: gpu.subgroup_mma_store_matrix %[[C]]
+func.func @addf(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<16x16xf16>) {
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0.000000e+00 : f16
+  %A = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true]} : memref<16x16xf16>, vector<16x16xf16>
+  %B = vector.transfer_read %arg1[%c0, %c0], %cst {permutation_map = #map0, in_bounds = [true, true]} : memref<16x16xf16>, vector<16x16xf16>
+  %C = arith.addf %A, %B : vector<16x16xf16>
+  vector.transfer_write %C, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<16x16xf16>, memref<16x16xf16>
+  return
+}
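
The new @addf case can be exercised directly through the VectorToGPU
conversion. A minimal sketch, assuming the test file's usual RUN setup
(the -convert-vector-to-gpu pass with -split-input-file; the exact flag
set in the RUN line may differ):

  # Run the conversion over the split test inputs and verify the CHECK lines.
  mlir-opt mlir/test/Conversion/VectorToGPU/vector-to-mma-ops.mlir \
      -split-input-file -convert-vector-to-gpu \
    | FileCheck mlir/test/Conversion/VectorToGPU/vector-to-mma-ops.mlir

With this patch, the arith.addf on vector<16x16xf16> in the new test is
rewritten to gpu.subgroup_mma_elementwise addf on
!gpu.mma_matrix<16x16xf16, "COp"> values, and the surrounding
vector.transfer_read/write ops become
gpu.subgroup_mma_load_matrix/gpu.subgroup_mma_store_matrix, as the CHECK
lines verify.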