From 086187351aed5236501d0a23af80fd03b64f49bd Mon Sep 17 00:00:00 2001
From: Zhaoshi Zheng
Date: Fri, 19 Jul 2024 20:33:11 -0700
Subject: [PATCH 1/8] [MLIR][Linalg] Scalable Vectorization of Reduction on
 the Trailing Dimension

Allow scalable vectorization of linalg::reduce and linalg::generic with a
reduction iterator. For now, only reduction on the trailing dimension is
supported.
---
 .../Linalg/Transforms/Vectorization.cpp    | 25 ++++--
 .../Linalg/vectorization-scalable.mlir     | 82 +++++++++++++++++++
 .../Linalg/vectorization-unsupported.mlir  | 20 ++---
 3 files changed, 112 insertions(+), 15 deletions(-)

diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index 7f7168eb86832..b2324d8aaf305 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -586,6 +586,12 @@ static SmallVector<bool> getDimsToReduce(LinalgOp linalgOp) {
       llvm::map_range(linalgOp.getIteratorTypesArray(), isReductionIterator));
 }
 
+static bool hasLinalgReduction(LinalgOp &op) {
+  return isa<linalg::ReduceOp>(op) ||
+         (isa<linalg::GenericOp>(op) &&
+          llvm::any_of(op.getIteratorTypesArray(), isReductionIterator));
+}
+
 /// Build a vector.transfer_write of `value` into `outputOperand` at indices set
 /// to all `0`; where `outputOperand` is an output operand of the LinalgOp
 /// currently being vectorized. If `dest` has null rank, build an memref.store.
@@ -1787,6 +1793,9 @@ vectorizeDynamicLinalgOpPrecondition(linalg::LinalgOp op,
   if (isa<ConvolutionOpInterface>(op.getOperation()))
     return vectorizeDynamicConvOpPrecondition(op, flatten1DDepthwiseConv);
 
+  if (hasLinalgReduction(op))
+    return reductionPreconditions(op);
+
   // TODO: Masking only supports dynamic element-wise ops, linalg.generic ops,
   // linalg.copy ops and ops that implement ContractionOpInterface for now.
   if (!isElementwise(op) &&
@@ -1976,6 +1985,7 @@ vectorizeScalableVectorPrecondition(Operation *op,
   // 1. exactly 1 dim is scalable and that's the _last_ parallel dim
   // 2. exactly 2 dims are scalable and those are the _last two adjacent_
   //    parallel dims
+  // 3. exactly 1 reduction dim is scalable and that's the last (innermost) dim
   // The 2nd restriction above means that only Matmul-like Ops are supported
   // when 2 dims are scalable, e.g. :
   //    * iterators = [parallel, parallel, reduction]
@@ -1992,11 +2002,15 @@ vectorizeScalableVectorPrecondition(Operation *op,
     scalableFlags.pop_back();
   }
 
-  // TODO: Support scalable vectorisation for reduction dims
-  if (iterators.back() == utils::IteratorType::reduction)
-    return failure();
+  if (iterators.back() == utils::IteratorType::reduction) {
+    if (iterators.size() != inputVectorSizes.size()) {
+      LDBG("Non-trailing reduction dim requested for scalable "
+           "vectorization\n");
+      return failure();
+    }
+  }
 
-  // If this is not the _last_ parallel dim, 1. above is not met
+  // If this is not the _last_ parallel dim, 1. or 3. above is not met
   if (seenParalell)
     return failure();
 
@@ -2017,7 +2031,8 @@ vectorizeScalableVectorPrecondition(Operation *op,
   // presence of scalable vectors
   return success(isElementwise(linalgOp) || isa<linalg::MatmulOp>(op) ||
                  isa<linalg::MatmulTransposeAOp>(op) ||
-                 isa<linalg::DepthwiseConv1DNwcWcOp>(op));
+                 isa<linalg::DepthwiseConv1DNwcWcOp>(op) ||
+                 hasLinalgReduction(linalgOp));
 }
 
 LogicalResult mlir::linalg::vectorizeOpPrecondition(
diff --git a/mlir/test/Dialect/Linalg/vectorization-scalable.mlir b/mlir/test/Dialect/Linalg/vectorization-scalable.mlir
index 4423ee6ea6a51..c29d8816d5f81 100644
--- a/mlir/test/Dialect/Linalg/vectorization-scalable.mlir
+++ b/mlir/test/Dialect/Linalg/vectorization-scalable.mlir
@@ -189,3 +189,85 @@ module attributes {transform.with_named_sequence} {
     transform.yield
   }
 }
+
+// -----
+
+func.func @vectorize_dynamic_reduction_scalable_1d(%arg0: tensor<?xf32>,
+                                                   %arg1: tensor<f32>) -> tensor<f32> {
+
+  %0 = linalg.reduce ins(%arg0 : tensor<?xf32>) outs(%arg1 : tensor<f32>) dimensions = [0]
+  (%in: f32, %init: f32) {
+    %0 = arith.addf %in, %init : f32
+    linalg.yield %0 : f32
+  }
+  return %0 : tensor<f32>
+}
+
+// CHECK-LABEL:  func.func @vectorize_dynamic_reduction_scalable_1d(
+// CHECK-SAME:     %[[ARG_0:.*]]: tensor<?xf32>, %[[ARG_1:.*]]: tensor<f32>) -> tensor<f32> {
+// CHECK:          %[[VAL_0:.*]] = arith.constant 0 : index
+// CHECK:          %[[VAL_1:.*]] = tensor.dim %[[ARG_0]], %[[VAL_0]] : tensor<?xf32>
+// CHECK:          %[[VAL_2:.*]] = arith.constant 0 : index
+// CHECK:          %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK:          %[[VAL_4:.*]] = vector.create_mask %[[VAL_1]] : vector<[4]xi1>
+// CHECK:          %[[VAL_5:.*]] = vector.mask %[[VAL_4]] { vector.transfer_read %[[ARG_0]][%[[VAL_2]]], %[[VAL_3]] {in_bounds = [true]} : tensor<?xf32>, vector<[4]xf32> } : vector<[4]xi1> -> vector<[4]xf32>
+// CHECK:          %[[VAL_6:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK:          %[[VAL_7:.*]] = vector.transfer_read %[[ARG_1]][], %[[VAL_6]] : tensor<f32>, vector<f32>
+// CHECK:          %[[VAL_8:.*]] = vector.extractelement %[[VAL_7]][] : vector<f32>
+// CHECK:          %[[VAL_9:.*]] = vector.mask %[[VAL_4]] { vector.multi_reduction <add>, %[[VAL_5]], %[[VAL_8]] [0] : vector<[4]xf32> to f32 } : vector<[4]xi1> -> f32
+// CHECK:          %[[VAL_10:.*]] = vector.broadcast %[[VAL_9]] : f32 to vector<f32>
+// CHECK:          %[[VAL_11:.*]] = vector.transfer_write %[[VAL_10]], %[[ARG_1]][] : vector<f32>, tensor<f32>
+// CHECK:          return %[[VAL_11]] : tensor<f32>
+// CHECK:        }
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.reduce"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.structured.vectorize %0 vector_sizes [[4]] : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+// Note: scalable version of `vectorize_dynamic_reduction` in test/Dialect/Linalg/vectorization.mlir.
+func.func @vectorize_dynamic_reduction_scalable_2d(%arg0: tensor, + %arg1: tensor) -> tensor { + %0 = linalg.generic { indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, + affine_map<(d0, d1) -> (d0)>], + iterator_types = ["parallel", "reduction"] } + ins(%arg0 : tensor) + outs(%arg1 : tensor) { + ^bb(%in: f32, %out: f32) : + %0 = arith.addf %in, %out : f32 + linalg.yield %0 : f32 + } -> tensor + return %0 : tensor +} + +// CHECK-LABEL: func.func @vectorize_dynamic_reduction_scalable_2d( +// CHECK-SAME: %[[ARG_0:.*]]: tensor, %[[ARG_1:.*]]: tensor) -> tensor { +// CHECK: %[[VAL_0:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_1:.*]] = tensor.dim %[[ARG_0]], %[[VAL_0]] : tensor +// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_3:.*]] = tensor.dim %[[ARG_0]], %[[VAL_2]] : tensor +// CHECK: %[[VAL_4:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_5:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK: %[[VAL_6:.*]] = vector.create_mask %[[VAL_1]], %[[VAL_3]] : vector<1x[4]xi1> +// CHECK: %[[VAL_7:.*]] = vector.mask %[[VAL_6]] { vector.transfer_read %[[ARG_0]][%[[VAL_4]], %[[VAL_4]]], %[[VAL_5]] {in_bounds = [true, true]} : tensor, vector<1x[4]xf32> } : vector<1x[4]xi1> -> vector<1x[4]xf32> +// CHECK: %[[VAL_8:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK: %[[VAL_9:.*]] = vector.create_mask %[[VAL_1]] : vector<1xi1> +// CHECK: %[[VAL_10:.*]] = vector.mask %[[VAL_9]] { vector.transfer_read %[[ARG_1]][%[[VAL_4]]], %[[VAL_8]] {in_bounds = [true]} : tensor, vector<1xf32> } : vector<1xi1> -> vector<1xf32> +// CHECK: %[[VAL_11:.*]] = vector.mask %[[VAL_6]] { vector.multi_reduction , %[[VAL_7]], %[[VAL_10]] [1] : vector<1x[4]xf32> to vector<1xf32> } : vector<1x[4]xi1> -> vector<1xf32> +// CHECK: %[[VAL_12:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_13:.*]] = vector.mask %[[VAL_9]] { vector.transfer_write %[[VAL_11]], %[[ARG_1]][%[[VAL_12]]] {in_bounds = [true]} : vector<1xf32>, tensor } : vector<1xi1> -> tensor +// CHECK: return %[[VAL_13]] : tensor +// CHECK: } + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.structured.vectorize %0 vector_sizes [1, [4]] : !transform.any_op + transform.yield + } +} diff --git a/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir b/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir index c7ec39b0dbfb3..164e7b23b1a1c 100644 --- a/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir +++ b/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir @@ -129,35 +129,35 @@ module attributes {transform.with_named_sequence} { // ----- -func.func @linalg_reduce_scalable(%input: tensor, - %acc: tensor) -> tensor { +func.func @linalg_reduce_scalable_leading_dim(%input: tensor, + %acc: tensor) -> tensor { // expected-error @+1 {{Attempted to vectorize, but failed}} - %0 = linalg.reduce ins(%input : tensor) outs(%acc : tensor) dimensions = [0] + %0 = linalg.reduce ins(%input : tensor) outs(%acc : tensor) dimensions = [0] (%in: f32, %init: f32) { %0 = arith.addf %in, %init : f32 linalg.yield %0 : f32 } - return %0 : tensor + return %0 : tensor } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["linalg.reduce"]} in %arg1 : (!transform.any_op) -> !transform.any_op - 
transform.structured.vectorize %0 vector_sizes [[4]] : !transform.any_op + transform.structured.vectorize %0 vector_sizes [[4], 1] : !transform.any_op transform.yield } } // ----- -func.func @linalg_generic_scalable_reduction_dim(%input: tensor, - %acc: tensor) -> tensor { +func.func @linalg_generic_scalable_reduction_leading_dim(%input: tensor, + %acc: tensor) -> tensor { // expected-error @+1 {{Attempted to vectorize, but failed}} %0 = linalg.generic { indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, - affine_map<(d0, d1) -> (d0)>], - iterator_types = ["parallel", "reduction"] } + affine_map<(d0, d1) -> (d1)>], + iterator_types = ["reduction", "parallel"] } ins(%input : tensor) outs(%acc : tensor) { ^bb(%in: f32, %out: f32) : @@ -170,7 +170,7 @@ func.func @linalg_generic_scalable_reduction_dim(%input: tensor, module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op - transform.structured.vectorize %0 vector_sizes [1, [4]] : !transform.any_op + transform.structured.vectorize %0 vector_sizes [[4], 1] : !transform.any_op transform.yield } } From fba222e9377302c8263a847ba30268c334d2c5bf Mon Sep 17 00:00:00 2001 From: Zhaoshi Zheng Date: Fri, 19 Jul 2024 20:40:15 -0700 Subject: [PATCH 2/8] [MLIR][Linalg] Add integration tests of scalable vectorization of reduction Note: I don't have a setup to run these tests natively (arm64-linux with sve). I am able to run them using QEMU on a x86_64-linux with below cmake variables when building llvm: -DARM_EMULATOR_EXECUTABLE="/qemu-aarch64" \ -DARM_EMULATOR_OPTIONS="-L /usr/aarch64-linux-gnu" \ -DARM_EMULATOR_MLIR_CPU_RUNNER_EXECUTABLE="/bin/mlir-cpu-runner-arm64" \ -DARM_EMULATOR_UTILS_LIB_DIR="/lib" --- .../Dialect/Linalg/CPU/ArmSVE/reduce_1d.mlir | 134 +++++++++++++++++ .../Dialect/Linalg/CPU/ArmSVE/reduce_2d.mlir | 136 ++++++++++++++++++ 2 files changed, 270 insertions(+) create mode 100644 mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/reduce_1d.mlir create mode 100644 mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/reduce_2d.mlir diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/reduce_1d.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/reduce_1d.mlir new file mode 100644 index 0000000000000..4bcb2ef79da83 --- /dev/null +++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/reduce_1d.mlir @@ -0,0 +1,134 @@ +// DEFINE: %{compile} = mlir-opt %s \ +// DEFINE: -transform-interpreter -test-transform-dialect-erase-schedule \ +// DEFINE: -one-shot-bufferize="bufferize-function-boundaries" -buffer-deallocation-pipeline -cse -canonicalize -convert-vector-to-scf -arm-sve-legalize-vector-storage \ +// DEFINE: -convert-vector-to-llvm="enable-arm-sve" -test-lower-to-llvm -o %t +// DEFINE: %{entry_point} = reduce_1d_f32 +// DEFINE: %{run} = %mcr_aarch64_cmd %t -e %{entry_point} -entry-point-result=void --march=aarch64 --mattr="+sve"\ +// DEFINE: -shared-libs=%mlir_native_utils_lib_dir/libmlir_runner_utils%shlibext,%mlir_native_utils_lib_dir/libmlir_c_runner_utils%shlibext + +// RUN: %{compile} + +// RUN: %{run} | FileCheck %s --check-prefix=REDUCE + +// REDEFINE: %{entry_point} = generic_reduce_1d_f32 +// RUN: %{run} | FileCheck %s --check-prefix=GENERIC + +func.func @reduce_1d_f32() { + // 1-D Tensor + %N = arith.constant 1000 : index + %c0_f32 = arith.constant 0.0 : f32 + + // Allocate the input and output tensors + %A_alloc = 
bufferization.alloc_tensor(%N) : tensor + %C_alloc = bufferization.alloc_tensor() : tensor + + // Initialise the tensors + %pi = arith.constant 3.1416 : f32 + %A_in = linalg.fill ins(%pi : f32) outs(%A_alloc : tensor) -> tensor + %C_in = tensor.insert %c0_f32 into %C_alloc[] : tensor + + // Reduce + %C_out = linalg.reduce ins(%A_in : tensor) outs(%C_in: tensor) dimensions = [0] + (%in: f32, %init: f32) { + %0 = arith.addf %in, %init : f32 + linalg.yield %0 : f32 + } + + // Print and verify the output + // REDUCE-LABEL: SVE: START OF TEST OUTPUT + vector.print str "SVE: START OF TEST OUTPUT\n" + + // REDUCE-NEXT: Unranked Memref {{.*}} rank = 0 offset = 0 sizes = [] strides = [] data = + // REDUCE-NEXT: [3141.6] + + %xf = tensor.cast %C_out : tensor to tensor<*xf32> + call @printMemrefF32(%xf) : (tensor<*xf32>) -> () + + // REDUCE-NEXT: SVE: END OF TEST OUTPUT + vector.print str "SVE: END OF TEST OUTPUT\n" + + return +} + +func.func @generic_reduce_1d_f32() { + // 1-D Tensor + %N = arith.constant 1000 : index + %c0_f32 = arith.constant 0.0 : f32 + + // Allocate the input and output tensors + %A_alloc = bufferization.alloc_tensor(%N) : tensor + %C_alloc = bufferization.alloc_tensor() : tensor + + // Initialise the tensors + %pi = arith.constant 3.1416 : f32 + %A_in = linalg.fill ins(%pi : f32) outs(%A_alloc : tensor) -> tensor + %C_in = tensor.insert %c0_f32 into %C_alloc[] : tensor + + // Reduce + %C_out = linalg.generic { indexing_maps = [affine_map<(d0) -> (d0)>, + affine_map<(d0) -> ()>], + iterator_types = ["reduction"] } + ins(%A_in : tensor) + outs(%C_in : tensor) { + ^bb(%in: f32, %out: f32) : + %0 = arith.addf %in, %out : f32 + linalg.yield %0 : f32 + } -> tensor + + // Print and verify the output + // GENERIC-LABEL: SVE: START OF TEST OUTPUT + vector.print str "SVE: START OF TEST OUTPUT\n" + + // GENERIC-NEXT: Unranked Memref {{.*}} rank = 0 offset = 0 sizes = [] strides = [] data = + // GENERIC-NEXT: [3141.6] + + %xf = tensor.cast %C_out : tensor to tensor<*xf32> + call @printMemrefF32(%xf) : (tensor<*xf32>) -> () + + // GENERIC-NEXT: SVE: END OF TEST OUTPUT + vector.print str "SVE: END OF TEST OUTPUT\n" + + return +} + +module attributes {transform.with_named_sequence} { + // A sequence that will tile and vectorise a Reduce Op + transform.named_sequence @tile_and_vectorize_reduce(%func + : !transform.op<"func.func"> {transform.readonly}) { + + // Step 0: Get a handle to the reduce Op + %reduce = transform.structured.match ops{["linalg.reduce", "linalg.generic"]} in %func + : (!transform.op<"func.func">) -> !transform.any_op + + // Step 1: Tile + %tiled_reduce, %loops:1 = transform.structured.tile_using_for %reduce tile_sizes [[4]] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + + // Step 2: Vectorize + transform.structured.vectorize %tiled_reduce vector_sizes [[4]] : !transform.any_op + + // Step 3: Lower vector.multi_reduction + transform.apply_patterns to %func { + transform.apply_patterns.vector.lower_masked_transfers + transform.apply_patterns.vector.lower_multi_reduction lowering_strategy = "innerreduction" + } : !transform.op<"func.func"> + + transform.yield + } + + // A sequence that goes over all functions in tis module and applies + // "tile_and_vectorize_reduce" + transform.named_sequence @__transform_main(%module: !transform.any_op {transform.readonly}) { + %funcs = transform.structured.match ops{["func.func"]} in %module + : (!transform.any_op) -> !transform.op<"func.func"> + + transform.foreach %funcs : !transform.op<"func.func"> { + ^bb2(%func : 
!transform.op<"func.func">): + transform.include @tile_and_vectorize_reduce failures(propagate) + (%func) : (!transform.op<"func.func">) -> () + } + transform.yield + } +} + +func.func private @printMemrefF32(%ptr : tensor<*xf32>) diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/reduce_2d.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/reduce_2d.mlir new file mode 100644 index 0000000000000..63d0ac5126e66 --- /dev/null +++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/reduce_2d.mlir @@ -0,0 +1,136 @@ +// DEFINE: %{compile} = mlir-opt %s \ +// DEFINE: -transform-interpreter -test-transform-dialect-erase-schedule \ +// DEFINE: -one-shot-bufferize="bufferize-function-boundaries" -buffer-deallocation-pipeline -cse -canonicalize -convert-vector-to-scf -arm-sve-legalize-vector-storage \ +// DEFINE: -convert-vector-to-llvm="enable-arm-sve" -test-lower-to-llvm -o %t +// DEFINE: %{entry_point} = reduce_2d_f32 +// DEFINE: %{run} = %mcr_aarch64_cmd %t -e %{entry_point} -entry-point-result=void --march=aarch64 --mattr="+sve"\ +// DEFINE: -shared-libs=%mlir_native_utils_lib_dir/libmlir_runner_utils%shlibext,%mlir_native_utils_lib_dir/libmlir_c_runner_utils%shlibext + +// RUN: %{compile} + +// RUN: %{run} | FileCheck %s --check-prefix=REDUCE + +// REDEFINE: %{entry_point} = generic_reduce_2d_f32 +// RUN: %{run} | FileCheck %s --check-prefix=GENERIC + +func.func @reduce_2d_f32() { + // 2-D Tensor + %M = arith.constant 16 : index + %N = arith.constant 1000 : index + %c0_f32 = arith.constant 0.0 : f32 + + // Allocate the input and output tensors + %A_alloc = bufferization.alloc_tensor(%M, %N) : tensor + %C_alloc = bufferization.alloc_tensor(%M) : tensor + + // Initialise the tensors + %pi = arith.constant 3.1416 : f32 + %A_in = linalg.fill ins(%pi : f32) outs(%A_alloc : tensor) -> tensor + %C_in = linalg.fill ins(%c0_f32 : f32) outs(%C_alloc : tensor) -> tensor + + // Reduce + %C_out = linalg.reduce ins(%A_in : tensor) outs(%C_in: tensor) dimensions = [1] + (%in: f32, %init: f32) { + %0 = arith.addf %in, %init : f32 + linalg.yield %0 : f32 + } + + // Print and verify the output + // REDUCE-LABEL: SVE: START OF TEST OUTPUT + vector.print str "SVE: START OF TEST OUTPUT\n" + + // REDUCE-NEXT: Unranked Memref {{.*}} rank = 1 offset = 0 sizes = [16] strides = [1] data = + // REDUCE-NEXT: [3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6] + + %xf = tensor.cast %C_out : tensor to tensor<*xf32> + call @printMemrefF32(%xf) : (tensor<*xf32>) -> () + + // REDUCE-NEXT: SVE: END OF TEST OUTPUT + vector.print str "SVE: END OF TEST OUTPUT\n" + + return +} + +func.func @generic_reduce_2d_f32() { + // 2-D Tensor + %M = arith.constant 16 : index + %N = arith.constant 1000 : index + %c0_f32 = arith.constant 0.0 : f32 + + // Allocate the input and output tensors + %A_alloc = bufferization.alloc_tensor(%M, %N) : tensor + %C_alloc = bufferization.alloc_tensor(%M) : tensor + + // Initialise the tensors + %pi = arith.constant 3.1416 : f32 + %A_in = linalg.fill ins(%pi : f32) outs(%A_alloc : tensor) -> tensor + %C_in = linalg.fill ins(%c0_f32 : f32) outs(%C_alloc : tensor) -> tensor + + // Reduce + %C_out = linalg.generic { indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, + affine_map<(d0, d1) -> (d0)>], + iterator_types = ["parallel", "reduction"] } + ins(%A_in : tensor) + outs(%C_in : tensor) { + ^bb(%in: f32, %out: f32) : + %0 = arith.addf %in, %out : f32 + linalg.yield %0 : f32 + } -> tensor + + // Print and verify the output 
+ // GENERIC-LABEL: SVE: START OF TEST OUTPUT + vector.print str "SVE: START OF TEST OUTPUT\n" + + // GENERIC-NEXT: Unranked Memref {{.*}} rank = 1 offset = 0 sizes = [16] strides = [1] data = + // GENERIC-NEXT: [3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6, 3141.6] + + %xf = tensor.cast %C_out : tensor to tensor<*xf32> + call @printMemrefF32(%xf) : (tensor<*xf32>) -> () + + // GENERIC-NEXT: SVE: END OF TEST OUTPUT + vector.print str "SVE: END OF TEST OUTPUT\n" + + return +} + +module attributes {transform.with_named_sequence} { + // A sequence that will tile and vectorise a Reduce Op + transform.named_sequence @tile_and_vectorize_reduce(%func + : !transform.op<"func.func"> {transform.readonly}) { + + // Step 0: Get a handle to the reduce Op + %reduce = transform.structured.match ops{["linalg.reduce", "linalg.generic"]} in %func + : (!transform.op<"func.func">) -> !transform.any_op + + // Step 1: Tile + %tiled_reduce, %loops:2 = transform.structured.tile_using_for %reduce tile_sizes [1, [4]] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + + // Step 2: Vectorize + transform.structured.vectorize %tiled_reduce vector_sizes [1, [4]] : !transform.any_op + + // Step 3: Lower vector.multi_reduction + transform.apply_patterns to %func { + transform.apply_patterns.vector.lower_masked_transfers + transform.apply_patterns.vector.lower_multi_reduction lowering_strategy = "innerreduction" + } : !transform.op<"func.func"> + + transform.yield + } + + // A sequence that goes over all functions in tis module and applies + // "tile_and_vectorize_reduce" + transform.named_sequence @__transform_main(%module: !transform.any_op {transform.readonly}) { + %funcs = transform.structured.match ops{["func.func"]} in %module + : (!transform.any_op) -> !transform.op<"func.func"> + + transform.foreach %funcs : !transform.op<"func.func"> { + ^bb2(%func : !transform.op<"func.func">): + transform.include @tile_and_vectorize_reduce failures(propagate) + (%func) : (!transform.op<"func.func">) -> () + } + transform.yield + } +} + +func.func private @printMemrefF32(%ptr : tensor<*xf32>) From 1e5ef34b12ce814d3b136cb38b5ccd98a0dd4a78 Mon Sep 17 00:00:00 2001 From: Zhaoshi Zheng Date: Sat, 20 Jul 2024 14:21:30 -0700 Subject: [PATCH 3/8] fix per clang-format --- mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp index b2324d8aaf305..7e3048b15fb9a 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @@ -2004,9 +2004,9 @@ vectorizeScalableVectorPrecondition(Operation *op, if (iterators.back() == utils::IteratorType::reduction) { if (iterators.size() != inputVectorSizes.size()) { - LDBG("Non-trailing reduction dim requested for scalable " - "vectorization\n"); - return failure(); + LDBG("Non-trailing reduction dim requested for scalable " + "vectorization\n"); + return failure(); } } From 75f0da224a077ddc8614e5a3ad085444e6f98935 Mon Sep 17 00:00:00 2001 From: Zhaoshi Zheng Date: Sun, 21 Jul 2024 17:45:16 -0700 Subject: [PATCH 4/8] Addressed review comments --- .../Linalg/Transforms/Vectorization.cpp | 35 ++++++---- .../Linalg/vectorization-scalable.mlir | 60 ++++++++--------- .../Linalg/vectorization-unsupported.mlir | 2 +- .../Dialect/Linalg/CPU/ArmSVE/reduce_1d.mlir | 65 
+++++++++++++++---- .../Dialect/Linalg/CPU/ArmSVE/reduce_2d.mlir | 48 +++++++++++++- 5 files changed, 151 insertions(+), 59 deletions(-) diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp index 7e3048b15fb9a..d17fae307e817 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @@ -586,7 +586,9 @@ static SmallVector getDimsToReduce(LinalgOp linalgOp) { llvm::map_range(linalgOp.getIteratorTypesArray(), isReductionIterator)); } -static bool hasLinalgReduction(LinalgOp &op) { +/// Check if `op` is a linalg.reduce or a linalg.generic that has at least one +/// reduction iterator. +static bool hasReductionIterator(LinalgOp &op) { return isa(op) || (isa(op) && llvm::any_of(op.getIteratorTypesArray(), isReductionIterator)); @@ -1793,7 +1795,7 @@ vectorizeDynamicLinalgOpPrecondition(linalg::LinalgOp op, if (isa(op.getOperation())) return vectorizeDynamicConvOpPrecondition(op, flatten1DDepthwiseConv); - if (hasLinalgReduction(op)) + if (hasReductionIterator(op)) return reductionPreconditions(op); // TODO: Masking only supports dynamic element-wise ops, linalg.generic ops, @@ -2002,18 +2004,27 @@ vectorizeScalableVectorPrecondition(Operation *op, scalableFlags.pop_back(); } - if (iterators.back() == utils::IteratorType::reduction) { - if (iterators.size() != inputVectorSizes.size()) { - LDBG("Non-trailing reduction dim requested for scalable " - "vectorization\n"); - return failure(); + switch (iterators.back()) { + case utils::IteratorType::reduction: { + // Check 3. above is met. + if (iterators.size() != inputVectorSizes.size()) { + LDBG("Non-trailing reduction dim requested for scalable " + "vectorization\n"); + return failure(); + } + break; + } + case utils::IteratorType::parallel: { + // Check 1. and 2. above are met. + if (seenParalell) { + LDBG("Inner parallel dim not requested for scalable " + "vectorization\n"); + return failure(); + } + break; } } - // If this is not the _last_ parallel dim, 1. or 3. above is not met - if (seenParalell) - return failure(); - // If present, check the 2nd scalable dim. 
ATM, only Matmul-like Ops are // supported for which expect the folowing config: // * iterators = [parallel, parallel, reduction] @@ -2032,7 +2043,7 @@ vectorizeScalableVectorPrecondition(Operation *op, return success(isElementwise(linalgOp) || isa(op) || isa(op) || isa(op) || - hasLinalgReduction(linalgOp)); + hasReductionIterator(linalgOp)); } LogicalResult mlir::linalg::vectorizeOpPrecondition( diff --git a/mlir/test/Dialect/Linalg/vectorization-scalable.mlir b/mlir/test/Dialect/Linalg/vectorization-scalable.mlir index c29d8816d5f81..df2f8d434f36b 100644 --- a/mlir/test/Dialect/Linalg/vectorization-scalable.mlir +++ b/mlir/test/Dialect/Linalg/vectorization-scalable.mlir @@ -193,7 +193,7 @@ module attributes {transform.with_named_sequence} { // ----- func.func @vectorize_dynamic_reduction_scalable_1d(%arg0: tensor, - %arg1: tensor) -> tensor { + %arg1: tensor) -> tensor { %0 = linalg.reduce ins(%arg0 : tensor) outs(%arg1 : tensor) dimensions = [0] (%in: f32, %init: f32) { @@ -205,20 +205,18 @@ func.func @vectorize_dynamic_reduction_scalable_1d(%arg0: tensor, // CHECK-LABEL: func.func @vectorize_dynamic_reduction_scalable_1d( // CHECK-SAME: %[[ARG_0:.*]]: tensor, %[[ARG_1:.*]]: tensor) -> tensor { -// CHECK: %[[VAL_0:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_1:.*]] = tensor.dim %[[ARG_0]], %[[VAL_0]] : tensor -// CHECK: %[[VAL_2:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f32 -// CHECK: %[[VAL_4:.*]] = vector.create_mask %[[VAL_1]] : vector<[4]xi1> -// CHECK: %[[VAL_5:.*]] = vector.mask %[[VAL_4]] { vector.transfer_read %[[ARG_0]][%[[VAL_2]]], %[[VAL_3]] {in_bounds = [true]} : tensor, vector<[4]xf32> } : vector<[4]xi1> -> vector<[4]xf32> -// CHECK: %[[VAL_6:.*]] = arith.constant 0.000000e+00 : f32 -// CHECK: %[[VAL_7:.*]] = vector.transfer_read %[[ARG_1]][], %[[VAL_6]] : tensor, vector -// CHECK: %[[VAL_8:.*]] = vector.extractelement %[[VAL_7]][] : vector -// CHECK: %[[VAL_9:.*]] = vector.mask %[[VAL_4]] { vector.multi_reduction , %[[VAL_5]], %[[VAL_8]] [0] : vector<[4]xf32> to f32 } : vector<[4]xi1> -> f32 -// CHECK: %[[VAL_10:.*]] = vector.broadcast %[[VAL_9]] : f32 to vector -// CHECK: %[[VAL_11:.*]] = vector.transfer_write %[[VAL_10]], %[[ARG_1]][] : vector, tensor -// CHECK: return %[[VAL_11]] : tensor -// CHECK: } +// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index +// CHECK: %[[DIM_A0_0:.*]] = tensor.dim %[[ARG_0]], %[[C0_idx]] : tensor +// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index +// CHECK: %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK: %[[MASK:.*]] = vector.create_mask %[[DIM_A0_0]] : vector<[4]xi1> +// CHECK: %[[VEC_RD_0:.*]] = vector.mask %[[MASK]] { vector.transfer_read %[[ARG_0]][%[[C0_idx]]], %[[C0_f32]] {in_bounds = [true]} : tensor, vector<[4]xf32> } : vector<[4]xi1> -> vector<[4]xf32> +// CHECK: %[[C0_F32:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK: %[[VEC_RD_1:.*]] = vector.transfer_read %[[ARG_1]][], %[[C0_F32]] : tensor, vector +// CHECK: %[[ACC_f32:.*]] = vector.extractelement %[[VEC_RD_1]][] : vector +// CHECK: %[[REDUCE:.*]] = vector.mask %[[MASK]] { vector.multi_reduction , %[[VEC_RD_0]], %[[ACC_f32]] [0] : vector<[4]xf32> to f32 } : vector<[4]xi1> -> f32 +// CHECK: %[[VEC_f32:.*]] = vector.broadcast %[[REDUCE]] : f32 to vector +// CHECK: %{{.*}} = vector.transfer_write %[[VEC_f32]], %[[ARG_1]][] : vector, tensor module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { @@ -247,27 +245,25 @@ 
func.func @vectorize_dynamic_reduction_scalable_2d(%arg0: tensor, // CHECK-LABEL: func.func @vectorize_dynamic_reduction_scalable_2d( // CHECK-SAME: %[[ARG_0:.*]]: tensor, %[[ARG_1:.*]]: tensor) -> tensor { -// CHECK: %[[VAL_0:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_1:.*]] = tensor.dim %[[ARG_0]], %[[VAL_0]] : tensor -// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_3:.*]] = tensor.dim %[[ARG_0]], %[[VAL_2]] : tensor -// CHECK: %[[VAL_4:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_5:.*]] = arith.constant 0.000000e+00 : f32 -// CHECK: %[[VAL_6:.*]] = vector.create_mask %[[VAL_1]], %[[VAL_3]] : vector<1x[4]xi1> -// CHECK: %[[VAL_7:.*]] = vector.mask %[[VAL_6]] { vector.transfer_read %[[ARG_0]][%[[VAL_4]], %[[VAL_4]]], %[[VAL_5]] {in_bounds = [true, true]} : tensor, vector<1x[4]xf32> } : vector<1x[4]xi1> -> vector<1x[4]xf32> -// CHECK: %[[VAL_8:.*]] = arith.constant 0.000000e+00 : f32 -// CHECK: %[[VAL_9:.*]] = vector.create_mask %[[VAL_1]] : vector<1xi1> -// CHECK: %[[VAL_10:.*]] = vector.mask %[[VAL_9]] { vector.transfer_read %[[ARG_1]][%[[VAL_4]]], %[[VAL_8]] {in_bounds = [true]} : tensor, vector<1xf32> } : vector<1xi1> -> vector<1xf32> -// CHECK: %[[VAL_11:.*]] = vector.mask %[[VAL_6]] { vector.multi_reduction , %[[VAL_7]], %[[VAL_10]] [1] : vector<1x[4]xf32> to vector<1xf32> } : vector<1x[4]xi1> -> vector<1xf32> -// CHECK: %[[VAL_12:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_13:.*]] = vector.mask %[[VAL_9]] { vector.transfer_write %[[VAL_11]], %[[ARG_1]][%[[VAL_12]]] {in_bounds = [true]} : vector<1xf32>, tensor } : vector<1xi1> -> tensor -// CHECK: return %[[VAL_13]] : tensor -// CHECK: } +// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index +// CHECK: %[[DIM_A0_0:.*]] = tensor.dim %[[ARG_0]], %[[C0_idx]] : tensor +// CHECK: %[[C1_idx:.*]] = arith.constant 1 : index +// CHECK: %[[DIM_A0_1:.*]] = tensor.dim %[[ARG_0]], %[[C1_idx]] : tensor +// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index +// CHECK: %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK: %[[MASK_2d:.*]] = vector.create_mask %[[DIM_A0_0]], %[[DIM_A0_1]] : vector<4x[8]xi1> +// CHECK: %[[VEC_RD_0:.*]] = vector.mask %[[MASK_2d]] { vector.transfer_read %[[ARG_0]][%[[C0_idx]], %[[C0_idx]]], %[[C0_f32]] {in_bounds = [true, true]} : tensor, vector<4x[8]xf32> } : vector<4x[8]xi1> -> vector<4x[8]xf32> +// CHECK: %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK: %[[MASK_1d:.*]] = vector.create_mask %[[DIM_A0_0]] : vector<4xi1> +// CHECK: %[[VEC_RD_1:.*]] = vector.mask %[[MASK_1d]] { vector.transfer_read %[[ARG_1]][%[[C0_idx]]], %[[C0_f32]] {in_bounds = [true]} : tensor, vector<4xf32> } : vector<4xi1> -> vector<4xf32> +// CHECK: %[[REDUCE:.*]] = vector.mask %[[MASK_2d]] { vector.multi_reduction , %[[VEC_RD_0]], %[[VEC_RD_1]] [1] : vector<4x[8]xf32> to vector<4xf32> } : vector<4x[8]xi1> -> vector<4xf32> +// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index +// CHECK: %{{.*}} = vector.mask %[[MASK_1d]] { vector.transfer_write %[[REDUCE]], %[[ARG_1]][%[[C0_idx]]] {in_bounds = [true]} : vector<4xf32>, tensor } : vector<4xi1> -> tensor module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op - transform.structured.vectorize %0 vector_sizes [1, [4]] : !transform.any_op + transform.structured.vectorize %0 vector_sizes [4, [8]] : !transform.any_op transform.yield } } diff --git 
a/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir b/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir index 164e7b23b1a1c..6c4de1635028f 100644 --- a/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir +++ b/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir @@ -151,7 +151,7 @@ module attributes {transform.with_named_sequence} { // ----- -func.func @linalg_generic_scalable_reduction_leading_dim(%input: tensor, +func.func @linalg_generic_reduction_scalable_leading_dim(%input: tensor, %acc: tensor) -> tensor { // expected-error @+1 {{Attempted to vectorize, but failed}} diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/reduce_1d.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/reduce_1d.mlir index 4bcb2ef79da83..7cdb35918c4c0 100644 --- a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/reduce_1d.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/reduce_1d.mlir @@ -8,10 +8,13 @@ // RUN: %{compile} -// RUN: %{run} | FileCheck %s --check-prefix=REDUCE +// RUN: %{run} | FileCheck %s --check-prefix=REDUCE-F32 + +// REDEFINE: %{entry_point} = reduce_1d_i32 +// RUN: %{run} | FileCheck %s --check-prefix=REDUCE-I32 // REDEFINE: %{entry_point} = generic_reduce_1d_f32 -// RUN: %{run} | FileCheck %s --check-prefix=GENERIC +// RUN: %{run} | FileCheck %s --check-prefix=GENERIC-F32 func.func @reduce_1d_f32() { // 1-D Tensor @@ -23,7 +26,7 @@ func.func @reduce_1d_f32() { %C_alloc = bufferization.alloc_tensor() : tensor // Initialise the tensors - %pi = arith.constant 3.1416 : f32 + %pi = arith.constant 3.1416 : f32 %A_in = linalg.fill ins(%pi : f32) outs(%A_alloc : tensor) -> tensor %C_in = tensor.insert %c0_f32 into %C_alloc[] : tensor @@ -35,16 +38,53 @@ func.func @reduce_1d_f32() { } // Print and verify the output - // REDUCE-LABEL: SVE: START OF TEST OUTPUT + // REDUCE-F32-LABEL: SVE: START OF TEST OUTPUT vector.print str "SVE: START OF TEST OUTPUT\n" - // REDUCE-NEXT: Unranked Memref {{.*}} rank = 0 offset = 0 sizes = [] strides = [] data = - // REDUCE-NEXT: [3141.6] + // REDUCE-F32-NEXT: Unranked Memref {{.*}} rank = 0 offset = 0 sizes = [] strides = [] data = + // REDUCE-F32-NEXT: [3141.6] %xf = tensor.cast %C_out : tensor to tensor<*xf32> call @printMemrefF32(%xf) : (tensor<*xf32>) -> () - // REDUCE-NEXT: SVE: END OF TEST OUTPUT + // REDUCE-F32-NEXT: SVE: END OF TEST OUTPUT + vector.print str "SVE: END OF TEST OUTPUT\n" + + return +} + +func.func @reduce_1d_i32() { + // 1-D Tensor + %N = arith.constant 1000 : index + %c0_i32 = arith.constant 0 : i32 + + // Allocate the input and output tensors + %A_alloc = bufferization.alloc_tensor(%N) : tensor + %C_alloc = bufferization.alloc_tensor() : tensor + + // Initialise the tensors + %pi = arith.constant 3 : i32 + %A_in = linalg.fill ins(%pi : i32) outs(%A_alloc : tensor) -> tensor + %C_in = tensor.insert %c0_i32 into %C_alloc[] : tensor + + // Reduce + %C_out = linalg.reduce ins(%A_in : tensor) outs(%C_in: tensor) dimensions = [0] + (%in: i32, %init: i32) { + %0 = arith.addi %in, %init : i32 + linalg.yield %0 : i32 + } + + // Print and verify the output + // REDUCE-I32-LABEL: SVE: START OF TEST OUTPUT + vector.print str "SVE: START OF TEST OUTPUT\n" + + // REDUCE-I32-NEXT: Unranked Memref {{.*}} rank = 0 offset = 0 sizes = [] strides = [] data = + // REDUCE-I32-NEXT: [3000] + + %xf = tensor.cast %C_out : tensor to tensor<*xi32> + call @printMemrefI32(%xf) : (tensor<*xi32>) -> () + + // REDUCE-I32-NEXT: SVE: END OF TEST OUTPUT vector.print str "SVE: END OF TEST OUTPUT\n" return @@ -60,7 +100,7 @@ func.func 
@generic_reduce_1d_f32() { %C_alloc = bufferization.alloc_tensor() : tensor // Initialise the tensors - %pi = arith.constant 3.1416 : f32 + %pi = arith.constant 3.1416 : f32 %A_in = linalg.fill ins(%pi : f32) outs(%A_alloc : tensor) -> tensor %C_in = tensor.insert %c0_f32 into %C_alloc[] : tensor @@ -76,16 +116,16 @@ func.func @generic_reduce_1d_f32() { } -> tensor // Print and verify the output - // GENERIC-LABEL: SVE: START OF TEST OUTPUT + // GENERIC-F32-LABEL: SVE: START OF TEST OUTPUT vector.print str "SVE: START OF TEST OUTPUT\n" - // GENERIC-NEXT: Unranked Memref {{.*}} rank = 0 offset = 0 sizes = [] strides = [] data = - // GENERIC-NEXT: [3141.6] + // GENERIC-F32-NEXT: Unranked Memref {{.*}} rank = 0 offset = 0 sizes = [] strides = [] data = + // GENERIC-F32-NEXT: [3141.6] %xf = tensor.cast %C_out : tensor to tensor<*xf32> call @printMemrefF32(%xf) : (tensor<*xf32>) -> () - // GENERIC-NEXT: SVE: END OF TEST OUTPUT + // GENERIC-F32-NEXT: SVE: END OF TEST OUTPUT vector.print str "SVE: END OF TEST OUTPUT\n" return @@ -132,3 +172,4 @@ module attributes {transform.with_named_sequence} { } func.func private @printMemrefF32(%ptr : tensor<*xf32>) +func.func private @printMemrefI32(%ptr : tensor<*xi32>) diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/reduce_2d.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/reduce_2d.mlir index 63d0ac5126e66..bcfe12e374b4e 100644 --- a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/reduce_2d.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/reduce_2d.mlir @@ -24,7 +24,7 @@ func.func @reduce_2d_f32() { %C_alloc = bufferization.alloc_tensor(%M) : tensor // Initialise the tensors - %pi = arith.constant 3.1416 : f32 + %pi = arith.constant 3.1416 : f32 %A_in = linalg.fill ins(%pi : f32) outs(%A_alloc : tensor) -> tensor %C_in = linalg.fill ins(%c0_f32 : f32) outs(%C_alloc : tensor) -> tensor @@ -62,7 +62,7 @@ func.func @generic_reduce_2d_f32() { %C_alloc = bufferization.alloc_tensor(%M) : tensor // Initialise the tensors - %pi = arith.constant 3.1416 : f32 + %pi = arith.constant 3.1416 : f32 %A_in = linalg.fill ins(%pi : f32) outs(%A_alloc : tensor) -> tensor %C_in = linalg.fill ins(%c0_f32 : f32) outs(%C_alloc : tensor) -> tensor @@ -93,6 +93,49 @@ func.func @generic_reduce_2d_f32() { return } +func.func @generic_reduce_2d_i32() { + // 2-D Tensor + %M = arith.constant 16 : index + %N = arith.constant 1000 : index + %c0_i32 = arith.constant 0 : i32 + + // Allocate the input and output tensors + %A_alloc = bufferization.alloc_tensor(%M, %N) : tensor + %C_alloc = bufferization.alloc_tensor(%M) : tensor + + // Initialise the tensors + %pi = arith.constant 3 : i32 + %A_in = linalg.fill ins(%pi : i32) outs(%A_alloc : tensor) -> tensor + %C_in = linalg.fill ins(%c0_i32 : i32) outs(%C_alloc : tensor) -> tensor + + // Reduce + %C_out = linalg.generic { indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, + affine_map<(d0, d1) -> (d0)>], + iterator_types = ["parallel", "reduction"] } + ins(%A_in : tensor) + outs(%C_in : tensor) { + ^bb(%in: i32, %out: i32) : + %0 = arith.addi %in, %out : i32 + linalg.yield %0 : i32 + } -> tensor + + // Print and verify the output + // GENERIC-I32-LABEL: SVE: START OF TEST OUTPUT + vector.print str "SVE: START OF TEST OUTPUT\n" + + // GENERIC-I32-NEXT: Unranked Memref {{.*}} rank = 1 offset = 0 sizes = [16] strides = [1] data = + // GENERIC-I32-NEXT: [3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000] + + %xf = tensor.cast %C_out : tensor to tensor<*xi32> + call 
@printMemrefI32(%xf) : (tensor<*xi32>) -> () + + // GENERIC-I32-NEXT: SVE: END OF TEST OUTPUT + vector.print str "SVE: END OF TEST OUTPUT\n" + + return +} + + module attributes {transform.with_named_sequence} { // A sequence that will tile and vectorise a Reduce Op transform.named_sequence @tile_and_vectorize_reduce(%func @@ -134,3 +177,4 @@ module attributes {transform.with_named_sequence} { } func.func private @printMemrefF32(%ptr : tensor<*xf32>) +func.func private @printMemrefI32(%ptr : tensor<*xi32>) From 00c683a9eca580f9b6c3726a00cdcf680a07717f Mon Sep 17 00:00:00 2001 From: Zhaoshi Zheng Date: Mon, 22 Jul 2024 15:57:10 -0700 Subject: [PATCH 5/8] Handle Matmul and Matvec's reduction dim on scalable vectorization In summary: 1. Do not allow scalable vectorization of the reduction dim of Matmul-like ops. 2. Allow scalable vectorization on only one dim of Matvec op. Allowed combinations of scalable flags and iterator types: Matmul: Iterators: ["parallel", "parallel", "reduction"] Scalable Flags: ["true", "true", "false"] ["false", "true", "false"] Matvec: Iterators: ["parallel", "reduction"] Scalable Flags: ["false", "true"] ["true", "false"] --- .../Linalg/Transforms/Vectorization.cpp | 15 ++++ .../Linalg/vectorization-scalable.mlir | 89 ++++++++++++++++++- .../Linalg/vectorization-unsupported.mlir | 64 ++++++++++++- 3 files changed, 166 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp index d17fae307e817..e0af708cb6d70 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @@ -2012,6 +2012,12 @@ vectorizeScalableVectorPrecondition(Operation *op, "vectorization\n"); return failure(); } + if (isa(op) || + isa(op)) { + LDBG("Scalable vectorization of the reduction dim in Matmul-like ops " + "is not supported\n"); + return failure(); + } break; } case utils::IteratorType::parallel: { @@ -2030,6 +2036,14 @@ vectorizeScalableVectorPrecondition(Operation *op, // * iterators = [parallel, parallel, reduction] // * scalable flags = [true, true, false] if (numOfScalableDims == 2) { + // Disallow below case which breaks 3. above: + // * iterators = [..., parallel, reduction] + // * scalable flags = [..., true, true] + if (iterators.back() == utils::IteratorType::reduction) { + LDBG("Higher dim than the trailing reduction dim requested for scalable " + "vectorization\n"); + return failure(); + } scalableFlags.pop_back(); iterators.pop_back(); @@ -2043,6 +2057,7 @@ vectorizeScalableVectorPrecondition(Operation *op, return success(isElementwise(linalgOp) || isa(op) || isa(op) || isa(op) || + isa(op) || hasReductionIterator(linalgOp)); } diff --git a/mlir/test/Dialect/Linalg/vectorization-scalable.mlir b/mlir/test/Dialect/Linalg/vectorization-scalable.mlir index df2f8d434f36b..4ee3088cc3778 100644 --- a/mlir/test/Dialect/Linalg/vectorization-scalable.mlir +++ b/mlir/test/Dialect/Linalg/vectorization-scalable.mlir @@ -230,7 +230,7 @@ module attributes {transform.with_named_sequence} { // Note: scalable version of `vectorize_dynamic_reduction` in test/Dialect/Linalg/vectorization.mlir. 
func.func @vectorize_dynamic_reduction_scalable_2d(%arg0: tensor, - %arg1: tensor) -> tensor { + %arg1: tensor) -> tensor { %0 = linalg.generic { indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"] } @@ -267,3 +267,90 @@ module attributes {transform.with_named_sequence} { transform.yield } } + +// ----- + +func.func @vectorize_dynamic_matvec_trailing_reduction_dim(%arg0: tensor, + %arg1: tensor, + %arg2: tensor) { + linalg.matvec ins(%arg0, %arg1 : tensor, tensor) + outs(%arg2 : tensor) -> tensor + return +} + +// CHECK-LABEL: func.func @vectorize_dynamic_matvec_trailing_reduction_dim( +// CHECK-SAME: %[[ARG_0:.*]]: tensor, %[[ARG_1:.*]]: tensor, %[[ARG_2:.*]]: tensor) { +// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index +// CHECK: %[[DIM_A0_0:.*]] = tensor.dim %[[ARG_0]], %[[C0_idx]] : tensor +// CHECK: %[[C1_idx:.*]] = arith.constant 1 : index +// CHECK: %[[DIM_A0_1:.*]] = tensor.dim %[[ARG_0]], %[[C1_idx]] : tensor +// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index +// CHECK: %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK: %[[MASK_2d:.*]] = vector.create_mask %[[DIM_A0_0]], %[[DIM_A0_1]] : vector<4x[4]xi1> +// CHECK: %[[VEC_RD_0:.*]] = vector.mask %[[MASK_2d]] { vector.transfer_read %[[ARG_0]][%[[C0_idx]], %[[C0_idx]]], %[[C0_f32]] {in_bounds = [true, true]} : tensor, vector<4x[4]xf32> } : vector<4x[4]xi1> -> vector<4x[4]xf32> +// CHECK: %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK: %[[MASK_d1:.*]] = vector.create_mask %[[DIM_A0_1]] : vector<[4]xi1> +// CHECK: %[[VEC_RD_1:.*]] = vector.mask %[[MASK_d1]] { vector.transfer_read %[[ARG_1]][%[[C0_idx]]], %[[C0_f32]] {in_bounds = [true, true], permutation_map = #map} : tensor, vector<4x[4]xf32> } : vector<[4]xi1> -> vector<4x[4]xf32> +// CHECK: %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK: %[[MASK_d2:.*]] = vector.create_mask %[[DIM_A0_0]] : vector<4xi1> +// CHECK: %[[VEC_RD_2:.*]] = vector.mask %[[MASK_d2]] { vector.transfer_read %[[ARG_2]][%[[C0_idx]]], %[[C0_f32]] {in_bounds = [true]} : tensor, vector<4xf32> } : vector<4xi1> -> vector<4xf32> +// CHECK: %[[MUL:.*]] = arith.mulf %[[VEC_RD_0:.*]], %[[VEC_RD_1:.*]] : vector<4x[4]xf32> +// CHECK: %[[REDUCE:.*]] = vector.mask %[[MASK_2d]] { vector.multi_reduction , %[[MUL]], %[[VEC_RD_2]] [1] : vector<4x[4]xf32> to vector<4xf32> } : vector<4x[4]xi1> -> vector<4xf32> +// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index +// CHECK: %{{.*}} = vector.mask %[[MASK_d2]] { vector.transfer_write %[[REDUCE]], %[[ARG_2]][%[[C0_idx]]] {in_bounds = [true]} : vector<4xf32>, tensor } : vector<4xi1> -> tensor + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.matvec"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.structured.vectorize %0 vector_sizes [4, [4]] : !transform.any_op + transform.yield + } +} + +// ----- + +func.func @vectorize_dynamic_generic_matvec_leading_parallel_dim(%arg0: tensor, + %arg1: tensor, + %arg2: tensor) -> tensor { + %0 = linalg.generic { indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, + affine_map<(d0, d1) -> (d1)>, + affine_map<(d0, d1) -> (d0)>], + iterator_types = ["parallel", "reduction"] } + ins(%arg0, %arg1 : tensor, tensor) + outs(%arg2 : tensor) { + ^bb(%mat: f32, %vec: f32, %res: f32) : + %0 = arith.mulf %mat, %vec : f32 + %1 = arith.addf %res, %0 : f32 + linalg.yield %1 : f32 + } -> tensor 
+ return %0 : tensor +} + +// CHECK-LABEL: func.func @vectorize_dynamic_generic_matvec_leading_parallel_dim( +// CHECK-SAME: %[[ARG_0:.*]]: tensor, %[[ARG_1:.*]]: tensor, %[[ARG_2:.*]]: tensor) -> tensor { +// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index +// CHECK: %[[DIM_A0_0:.*]] = tensor.dim %[[ARG_0]], %[[C0_idx]] : tensor +// CHECK: %[[C1_idx:.*]] = arith.constant 1 : index +// CHECK: %[[DIM_A0_1:.*]] = tensor.dim %[[ARG_0]], %[[C1_idx]] : tensor +// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index +// CHECK: %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK: %[[MASK_2d:.*]] = vector.create_mask %[[DIM_A0_0]], %[[DIM_A0_1]] : vector<[4]x4xi1> +// CHECK: %[[VEC_RD_0:.*]] = vector.mask %[[MASK_2d]] { vector.transfer_read %[[ARG_0]][%[[C0_idx]], %[[C0_idx]]], %[[C0_f32]] {in_bounds = [true, true]} : tensor, vector<[4]x4xf32> } : vector<[4]x4xi1> -> vector<[4]x4xf32> +// CHECK: %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK: %[[MASK_d1:.*]] = vector.create_mask %[[DIM_A0_1]] : vector<4xi1> +// CHECK: %[[VEC_RD_1:.*]] = vector.mask %[[MASK_d1]] { vector.transfer_read %[[ARG_1]][%[[C0_idx]]], %[[C0_f32]] {in_bounds = [true, true], permutation_map = #map} : tensor, vector<[4]x4xf32> } : vector<4xi1> -> vector<[4]x4xf32> +// CHECK: %[[C0_f32:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK: %[[MASK_d2:.*]] = vector.create_mask %[[DIM_A0_0]] : vector<[4]xi1> +// CHECK: %[[VEC_RD_2:.*]] = vector.mask %[[MASK_d2]] { vector.transfer_read %[[ARG_2]][%[[C0_idx]]], %[[C0_f32]] {in_bounds = [true]} : tensor, vector<[4]xf32> } : vector<[4]xi1> -> vector<[4]xf32> +// CHECK: %[[MUL:.*]] = arith.mulf %[[VEC_RD_0:.*]], %[[VEC_RD_1:.*]] : vector<[4]x4xf32> +// CHECK: %[[REDUCE:.*]] = vector.mask %[[MASK_2d]] { vector.multi_reduction , %[[MUL]], %[[VEC_RD_2]] [1] : vector<[4]x4xf32> to vector<[4]xf32> } : vector<[4]x4xi1> -> vector<[4]xf32> +// CHECK: %[[C0_idx:.*]] = arith.constant 0 : index +// CHECK: %{{.*}} = vector.mask %[[MASK_d2]] { vector.transfer_write %[[REDUCE]], %[[ARG_2]][%[[C0_idx]]] {in_bounds = [true]} : vector<[4]xf32>, tensor } : vector<[4]xi1> -> tensor + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.structured.vectorize %0 vector_sizes [[4], 4] : !transform.any_op + transform.yield + } +} diff --git a/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir b/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir index 6c4de1635028f..e9f8e08ca0c6b 100644 --- a/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir +++ b/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir @@ -177,10 +177,27 @@ module attributes {transform.with_named_sequence} { // ----- +func.func @linalg_matvec_scalable_two_dims(%A: memref, %B: memref, %C: memref) { + // expected-error @+1 {{Attempted to vectorize, but failed}} + linalg.matvec ins(%A, %B: memref, memref) + outs(%C: memref) + return +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %matmul = transform.structured.match ops{["linalg.matvec"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.structured.vectorize %matmul vector_sizes [[4], [4]] : !transform.any_op + transform.yield + } +} + +// ----- + func.func @linalg_matmul_scalable_leading_parallel_dim(%A: memref, %B: memref, %C: 
memref) { // expected-error @+1 {{Attempted to vectorize, but failed}} linalg.matmul ins(%A, %B: memref, memref) - outs(%C: memref) + outs(%C: memref) return } @@ -191,3 +208,48 @@ module attributes {transform.with_named_sequence} { transform.yield } } + +// ----- + +func.func @linalg_matmul_scalable_trailing_reduction_dim(%A: memref, %B: memref, %C: memref) { + // expected-error @+1 {{Attempted to vectorize, but failed}} + linalg.matmul ins(%A, %B: memref, memref) + outs(%C: memref) + return +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.structured.vectorize %matmul vector_sizes [8, 16, [4]] : !transform.any_op + transform.yield + } +} + +// ----- + +func.func @linalg_generic_matmul_scalable_two_trailing_dims(%A: tensor, %B: tensor<64x?xf32>, + %C: tensor) -> tensor { + + // expected-error @+1 {{Attempted to vectorize, but failed}} + %0 = linalg.generic { indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, + affine_map<(d0, d1, d2) -> (d2, d1)>, + affine_map<(d0, d1, d2) -> (d0, d1)>], + iterator_types = ["parallel", "parallel", "reduction"] } + ins(%A, %B : tensor, tensor<64x?xf32>) + outs(%C: tensor) { + ^bb(%in1: f32, %in2: f32, %out: f32) : + %0 = arith.mulf %in1, %in2 : f32 + %1 = arith.addf %0, %out : f32 + linalg.yield %1 : f32 + } -> tensor + return %0 : tensor +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.structured.vectorize %0 vector_sizes [2, [4], [4]] : !transform.any_op + transform.yield + } +} From 5a4ac6d67542c2a259ed3c7c9dcf8bf3c34dfbb9 Mon Sep 17 00:00:00 2001 From: Zhaoshi Zheng Date: Mon, 22 Jul 2024 16:19:45 -0700 Subject: [PATCH 6/8] update per clang-format --- mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp index e0af708cb6d70..d983b52a6c2dc 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @@ -2057,8 +2057,7 @@ vectorizeScalableVectorPrecondition(Operation *op, return success(isElementwise(linalgOp) || isa(op) || isa(op) || isa(op) || - isa(op) || - hasReductionIterator(linalgOp)); + isa(op) || hasReductionIterator(linalgOp)); } LogicalResult mlir::linalg::vectorizeOpPrecondition( From 7c71012335f363662c76a135bd5c4d771be9b02f Mon Sep 17 00:00:00 2001 From: Zhaoshi Zheng Date: Mon, 22 Jul 2024 16:36:46 -0700 Subject: [PATCH 7/8] 2nd update per clang-format --- .../Linalg/Transforms/Vectorization.cpp | 44 +++++++++---------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp index d983b52a6c2dc..165e5d1d0c59b 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @@ -2005,30 +2005,30 @@ vectorizeScalableVectorPrecondition(Operation *op, } switch (iterators.back()) { - case utils::IteratorType::reduction: { - // Check 3. above is met. 
-      if (iterators.size() != inputVectorSizes.size()) {
-        LDBG("Non-trailing reduction dim requested for scalable "
-             "vectorization\n");
-        return failure();
-      }
-      if (isa<linalg::MatmulOp>(op) ||
-          isa<linalg::MatmulTransposeAOp>(op)) {
-        LDBG("Scalable vectorization of the reduction dim in Matmul-like ops "
-             "is not supported\n");
-        return failure();
-      }
-      break;
-    }
-    case utils::IteratorType::parallel: {
-      // Check 1. and 2. above are met.
-      if (seenParalell) {
-        LDBG("Inner parallel dim not requested for scalable "
-             "vectorization\n");
-        return failure();
-      }
-      break;
-    }
+  case utils::IteratorType::reduction: {
+    // Check 3. above is met.
+    if (iterators.size() != inputVectorSizes.size()) {
+      LDBG("Non-trailing reduction dim requested for scalable "
+           "vectorization\n");
+      return failure();
+    }
+    if (isa<linalg::MatmulOp>(op) ||
+        isa<linalg::MatmulTransposeAOp>(op)) {
+      LDBG("Scalable vectorization of the reduction dim in Matmul-like ops "
+           "is not supported\n");
+      return failure();
+    }
+    break;
+  }
+  case utils::IteratorType::parallel: {
+    // Check 1. and 2. above are met.
+    if (seenParalell) {
+      LDBG("Inner parallel dim not requested for scalable "
+           "vectorization\n");
+      return failure();
+    }
+    break;
+  }
   }
 
   // If present, check the 2nd scalable dim. ATM, only Matmul-like Ops are

From 0044740046375d3fe5894269e42c391bc4a98e74 Mon Sep 17 00:00:00 2001
From: Zhaoshi Zheng
Date: Mon, 22 Jul 2024 16:43:30 -0700
Subject: [PATCH 8/8] another update per clang-format

---
 mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index 165e5d1d0c59b..c4dab7d061b4b 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -2012,8 +2012,7 @@ vectorizeScalableVectorPrecondition(Operation *op,
          "vectorization\n");
     return failure();
   }
-  if (isa<linalg::MatmulOp>(op) ||
-      isa<linalg::MatmulTransposeAOp>(op)) {
+  if (isa<linalg::MatmulOp>(op) || isa<linalg::MatmulTransposeAOp>(op)) {
     LDBG("Scalable vectorization of the reduction dim in Matmul-like ops "
          "is not supported\n");
     return failure();
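
Taken together, the schedule exercised by the tests in this series reduces to a
short transform-dialect sequence. The following is a minimal sketch, assuming a
linalg.reduce payload like the ones in the integration tests above and reusing
only transform ops that already appear in this series: tile the dynamic
trailing (reduction) dimension by a scalable factor, then mask-vectorize the
tiled op with a matching scalable vector size.

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%module: !transform.any_op {transform.readonly}) {
    // Match the reduction op in the payload module.
    %reduce = transform.structured.match ops{["linalg.reduce"]} in %module
      : (!transform.any_op) -> !transform.any_op
    // Tile the trailing (reduction) dim by a scalable factor of [4].
    %tiled_reduce, %loops:1 = transform.structured.tile_using_for %reduce tile_sizes [[4]]
      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
    // Mask-vectorize with the matching scalable vector size.
    transform.structured.vectorize %tiled_reduce vector_sizes [[4]] : !transform.any_op
    transform.yield
  }
}

Lowering of the resulting masked vector.multi_reduction is handled separately,
e.g. with apply_patterns.vector.lower_multi_reduction as in the reduce_1d.mlir
test above.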