From f1db4aec3083e9388e2b8f38263a5a2f04a9bc02 Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Thu, 15 Dec 2022 19:34:14 +0000 Subject: [PATCH] [mlir][VectorToGPU] Support transposed+broadcasted 2D MMA load This adds support for loading from 2-D memrefs, in addition to D139655, which added support for loading from 1-D memrefs. Reviewed By: ThomasRaoux Differential Revision: https://reviews.llvm.org/D140136 --- .../Conversion/VectorToGPU/VectorToGPU.cpp | 10 +++++---- .../VectorToGPU/vector-to-mma-ops.mlir | 22 +++++++++++++++++-- 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp b/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp index 836e82ed44412..c0d093b843983 100644 --- a/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp +++ b/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp @@ -95,18 +95,20 @@ static bool contractSupportsMMAMatrixType(vector::ContractionOp contract, // Return true if the given map represents a transposed matrix load, // i.e. (d0, d1, ...) -> (dn-1, dn-2). static bool isTransposeMatrixLoadMap(OpBuilder &b, AffineMap permutationMap) { + MLIRContext *ctx = b.getContext(); auto nDim = permutationMap.getNumDims(); + AffineExpr zero = b.getAffineConstantExpr(0); if (nDim < 2) { // Support transposed+broadcasted cases: affine_map<(d0) -> (d0, 0)>. AffineExpr dim0 = b.getAffineDimExpr(0); - AffineExpr zero = b.getAffineConstantExpr(0); - return permutationMap == AffineMap::get(1, 0, {dim0, zero}, b.getContext()); + return permutationMap == AffineMap::get(1, 0, {dim0, zero}, ctx); } AffineExpr innerDim = b.getAffineDimExpr(nDim - 1); AffineExpr outerDim = b.getAffineDimExpr(nDim - 2); - return permutationMap == - AffineMap::get(nDim, 0, {innerDim, outerDim}, b.getContext()); + // Support both transposed and transposed+broadcasted cases. 
+ return permutationMap == AffineMap::get(nDim, 0, {innerDim, outerDim}, ctx) || + permutationMap == AffineMap::get(nDim, 0, {innerDim, zero}, ctx); } // Return the stide for the dimension 0 of |type| if it is a memref and has a diff --git a/mlir/test/Conversion/VectorToGPU/vector-to-mma-ops.mlir b/mlir/test/Conversion/VectorToGPU/vector-to-mma-ops.mlir index 56a8599095820..b00d34f23832c 100644 --- a/mlir/test/Conversion/VectorToGPU/vector-to-mma-ops.mlir +++ b/mlir/test/Conversion/VectorToGPU/vector-to-mma-ops.mlir @@ -190,13 +190,13 @@ func.func @matmul_transposed(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>, return } -// CHECK-LABEL: func @matmul_transposed_broadcasted +// CHECK-LABEL: func @matmul_transposed_broadcasted_1d // CHECK-DAG: %[[A:.+]] = gpu.subgroup_mma_load_matrix %{{.*}}[%{{.*}}] {leadDimension = 0 : index, transpose} : memref<16xf16> -> !gpu.mma_matrix<16x16xf16, "AOp"> // CHECK-DAG: %[[B:.+]] = gpu.subgroup_mma_load_matrix %{{.*}}[%{{.*}}] {leadDimension = 0 : index} : memref<16xf16> -> !gpu.mma_matrix<16x16xf16, "BOp"> // CHECK-DAG: %[[C:.+]] = gpu.subgroup_mma_load_matrix %{{.*}}[%{{.*}}, %{{.*}}] {leadDimension = 16 : index} : memref<16x16xf16> -> !gpu.mma_matrix<16x16xf16, "COp"> // CHECK: %[[D:.+]] = gpu.subgroup_mma_compute %[[A]], %[[B]], %[[C]] : !gpu.mma_matrix<16x16xf16, "AOp">, !gpu.mma_matrix<16x16xf16, "BOp"> -> !gpu.mma_matrix<16x16xf16, "COp"> // CHECK: gpu.subgroup_mma_store_matrix %[[D]], %{{.*}}[%{{.*}}, %{{.*}}] {leadDimension = 16 : index} : !gpu.mma_matrix<16x16xf16, "COp">, memref<16x16xf16> -func.func @matmul_transposed_broadcasted(%arg0: memref<16xf16>, %arg1: memref<16xf16>, %arg2: memref<16x16xf16>) { +func.func @matmul_transposed_broadcasted_1d(%arg0: memref<16xf16>, %arg1: memref<16xf16>, %arg2: memref<16x16xf16>) { %cst_0 = arith.constant dense<0.000000e+00> : vector<16x16xf16> %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 @@ -207,3 +207,21 @@ func.func 
@matmul_transposed_broadcasted(%arg0: memref<16xf16>, %arg1: memref<16 vector.transfer_write %D, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<16x16xf16>, memref<16x16xf16> return } + +// CHECK-LABEL: func @matmul_transposed_broadcasted_2d +// CHECK-DAG: %[[A:.+]] = gpu.subgroup_mma_load_matrix %{{.*}}[%{{.*}}] {leadDimension = 0 : index, transpose} : memref<32x32xf16> -> !gpu.mma_matrix<16x16xf16, "AOp"> +// CHECK-DAG: %[[B:.+]] = gpu.subgroup_mma_load_matrix %{{.*}}[%{{.*}}] {leadDimension = 0 : index} : memref<32x32xf16> -> !gpu.mma_matrix<16x16xf16, "BOp"> +// CHECK-DAG: %[[C:.+]] = gpu.subgroup_mma_load_matrix %{{.*}}[%{{.*}}, %{{.*}}] {leadDimension = 16 : index} : memref<16x16xf16> -> !gpu.mma_matrix<16x16xf16, "COp"> +// CHECK: %[[D:.+]] = gpu.subgroup_mma_compute %[[A]], %[[B]], %[[C]] : !gpu.mma_matrix<16x16xf16, "AOp">, !gpu.mma_matrix<16x16xf16, "BOp"> -> !gpu.mma_matrix<16x16xf16, "COp"> +// CHECK: gpu.subgroup_mma_store_matrix %[[D]], %{{.*}}[%{{.*}}, %{{.*}}] {leadDimension = 16 : index} : !gpu.mma_matrix<16x16xf16, "COp">, memref<16x16xf16> +func.func @matmul_transposed_broadcasted_2d(%arg0: memref<32x32xf16>, %arg1: memref<32x32xf16>, %arg2: memref<16x16xf16>) { + %cst_0 = arith.constant dense<0.000000e+00> : vector<16x16xf16> + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %A = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true], permutation_map = affine_map<(d0, d1) -> (d1, 0)>} : memref<32x32xf16>, vector<16x16xf16> + %B = vector.transfer_read %arg1[%c0, %c0], %cst {in_bounds = [true, true], permutation_map = affine_map<(d0, d1) -> (d1, 0)>} : memref<32x32xf16>, vector<16x16xf16> + %C = vector.transfer_read %arg2[%c0, %c0], %cst {in_bounds = [true, true]} : memref<16x16xf16>, vector<16x16xf16> + %D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind} %A, %B, %C : vector<16x16xf16>, vector<16x16xf16> into 
vector<16x16xf16> + vector.transfer_write %D, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<16x16xf16>, memref<16x16xf16> + return +}