From 95260b8b1a22adfe4f0412f67b8fedea2d200294 Mon Sep 17 00:00:00 2001 From: Arun Thangamani Date: Mon, 24 Nov 2025 06:32:47 -0800 Subject: [PATCH 1/4] sink vector transfer reads and loads before the consumer --- .../TransformOps/X86VectorTransformOps.td | 11 ++ .../mlir/Dialect/X86Vector/Transforms.h | 4 + .../TransformOps/X86VectorTransformOps.cpp | 5 + .../X86Vector/Transforms/CMakeLists.txt | 1 + .../Transforms/SinkVectorProducerOps.cpp | 87 ++++++++++ .../X86Vector/sink-vector-producer-ops.mlir | 154 ++++++++++++++++++ 6 files changed, 262 insertions(+) create mode 100644 mlir/lib/Dialect/X86Vector/Transforms/SinkVectorProducerOps.cpp create mode 100644 mlir/test/Dialect/X86Vector/sink-vector-producer-ops.mlir diff --git a/mlir/include/mlir/Dialect/X86Vector/TransformOps/X86VectorTransformOps.td b/mlir/include/mlir/Dialect/X86Vector/TransformOps/X86VectorTransformOps.td index 3c5294ff14fc7..12ba5e9f11141 100644 --- a/mlir/include/mlir/Dialect/X86Vector/TransformOps/X86VectorTransformOps.td +++ b/mlir/include/mlir/Dialect/X86Vector/TransformOps/X86VectorTransformOps.td @@ -38,6 +38,17 @@ def ApplyVectorContractToPackedTypeDotProductPatternsOp : Op]> { + let description = [{ + Collect patterns to sink vector producer operations forward in a block to + place them immediately before their first use. 
+ }]; + + let assemblyFormat = "attr-dict"; +} + #endif // X86VECTOR_TRANSFORM_OPS diff --git a/mlir/include/mlir/Dialect/X86Vector/Transforms.h b/mlir/include/mlir/Dialect/X86Vector/Transforms.h index fc46dff63c2b7..b9c9054f57890 100644 --- a/mlir/include/mlir/Dialect/X86Vector/Transforms.h +++ b/mlir/include/mlir/Dialect/X86Vector/Transforms.h @@ -91,6 +91,10 @@ void populateVectorContractToFMAPatterns(RewritePatternSet &patterns); void populateVectorContractToPackedTypeDotProductPatterns( RewritePatternSet &patterns); +// Performs forward scheduling of vector producer ops to minimize their live +// range by placing each one immediately before its first user in the block. +void populateSinkVectorProducerOpsPatterns(RewritePatternSet &patterns); + //===----------------------------------------------------------------------===// /// Helpers extracted from: /// - clang/lib/Headers/avxintrin.h diff --git a/mlir/lib/Dialect/X86Vector/TransformOps/X86VectorTransformOps.cpp b/mlir/lib/Dialect/X86Vector/TransformOps/X86VectorTransformOps.cpp index 95db208207672..25772f2aa57f4 100644 --- a/mlir/lib/Dialect/X86Vector/TransformOps/X86VectorTransformOps.cpp +++ b/mlir/lib/Dialect/X86Vector/TransformOps/X86VectorTransformOps.cpp @@ -32,6 +32,11 @@ void mlir::transform::ApplyVectorContractToPackedTypeDotProductPatternsOp:: x86vector::populateVectorContractToPackedTypeDotProductPatterns(patterns); } +void mlir::transform::ApplySinkVectorProducerOpsPatternsOp::populatePatterns( + RewritePatternSet &patterns) { + x86vector::populateSinkVectorProducerOpsPatterns(patterns); +} + //===----------------------------------------------------------------------===// // Transform op registration //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/X86Vector/Transforms/CMakeLists.txt b/mlir/lib/Dialect/X86Vector/Transforms/CMakeLists.txt index 2cab50fb591c4..cc4d3cac0f7ea 100644 --- a/mlir/lib/Dialect/X86Vector/Transforms/CMakeLists.txt +++
b/mlir/lib/Dialect/X86Vector/Transforms/CMakeLists.txt @@ -3,6 +3,7 @@ add_mlir_dialect_library(MLIRX86VectorTransforms LegalizeForLLVMExport.cpp VectorContractToFMA.cpp VectorContractToPackedTypeDotProduct.cpp + SinkVectorProducerOps.cpp LINK_LIBS PUBLIC MLIRArithDialect diff --git a/mlir/lib/Dialect/X86Vector/Transforms/SinkVectorProducerOps.cpp b/mlir/lib/Dialect/X86Vector/Transforms/SinkVectorProducerOps.cpp new file mode 100644 index 0000000000000..85cb18687a4fc --- /dev/null +++ b/mlir/lib/Dialect/X86Vector/Transforms/SinkVectorProducerOps.cpp @@ -0,0 +1,87 @@ +//===- SinkVectorProducerOps.cpp ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/Dialect/Vector/Utils/VectorUtils.h" +#include "mlir/Dialect/X86Vector/Transforms.h" +#include "mlir/Dialect/X86Vector/X86VectorDialect.h" + +#include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/Dominance.h" +#include "mlir/IR/PatternMatch.h" + +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" + +using namespace mlir; +using namespace mlir::vector; +using namespace mlir::x86vector; + +/// Sink vector producers forward to reduce live ranges. +/// This pattern applies to ops such as vector.load and vector.transfer_read. +template +struct SinkVectorProducerOps final : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(producerOp op, + PatternRewriter &rewriter) const override { + + // Collect all users of the producer op. 
+ llvm::SmallVector users; + for (OpResult result : op->getResults()) + for (Operation *user : result.getUsers()) + users.push_back(user); + + // If there are no users, nothing to sink. + if (users.empty()) + return failure(); + + // If the next op is already a user, do not move. + Operation *nextOp = op->getNextNode(); + if (llvm::is_contained(users, nextOp)) + return failure(); + + // Prevent pathological looping: + // If the next op produces values used by any of op's users, don't move. + llvm::SmallVector nextOpUsers; + for (OpResult result : nextOp->getResults()) + for (Operation *user : result.getUsers()) + nextOpUsers.push_back(user); + if (llvm::any_of(users, [&](Operation *x) { + return llvm::is_contained(nextOpUsers, x); + })) { + return failure(); + } + + // Find the nearest user by scanning forward. + while (nextOp) { + if (llvm::is_contained(users, nextOp)) + break; + + nextOp = nextOp->getNextNode(); + } + + if (!nextOp) + return failure(); + + // // Both ops must be in the same block to safely move. + if (op->getBlock() != nextOp->getBlock()) + return failure(); + + // Move producer immediately before its first user. 
+ op->moveBefore(nextOp); + + return success(); + } +}; + +void x86vector::populateSinkVectorProducerOpsPatterns( + RewritePatternSet &patterns) { + patterns.add, + SinkVectorProducerOps>(patterns.getContext()); +} diff --git a/mlir/test/Dialect/X86Vector/sink-vector-producer-ops.mlir b/mlir/test/Dialect/X86Vector/sink-vector-producer-ops.mlir new file mode 100644 index 0000000000000..04045b05bda49 --- /dev/null +++ b/mlir/test/Dialect/X86Vector/sink-vector-producer-ops.mlir @@ -0,0 +1,154 @@ +// RUN: mlir-opt %s -transform-interpreter -cse -split-input-file | FileCheck %s + +func.func @sink_vector_loads(%arg0: memref<16x16xf32>, %arg1: vector<8xf32>) -> vector<8xf32> { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %0 = vector.load %arg0[%c0, %c0] : memref<16x16xf32>, vector<8xf32> + %1 = vector.load %arg0[%c0, %c8] : memref<16x16xf32>, vector<8xf32> + %2 = vector.load %arg0[%c8, %c0] : memref<16x16xf32>, vector<8xf32> + %3 = vector.load %arg0[%c8, %c8] : memref<16x16xf32>, vector<8xf32> + %4 = vector.fma %0, %1, %arg1 : vector<8xf32> + %5 = vector.fma %2, %3, %4 : vector<8xf32> + return %5 : vector<8xf32> +} + +// CHECK-LABEL: @sink_vector_loads +// CHECK: vector.load +// CHECK-NEXT: vector.load +// CHECK-NEXT: vector.fma +// CHECK-NEXT: vector.load +// CHECK-NEXT: vector.load +// CHECK-NEXT: vector.fma + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %0 { + transform.apply_patterns.x86vector.sink_vector_producer_ops + } : !transform.any_op + transform.yield + } +} + +// ----- + +func.func @sink_vector_transfer_reads(%arg0: memref<16x16xf32>, %arg1: vector<8xf32>) -> vector<8xf32> { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %0 = ub.poison : f32 + %1 = vector.transfer_read %arg0[%c0, 
%c0], %0 {in_bounds = [true]} : memref<16x16xf32>, vector<8xf32> + %2 = vector.transfer_read %arg0[%c0, %c8], %0 {in_bounds = [true]} : memref<16x16xf32>, vector<8xf32> + %3 = vector.transfer_read %arg0[%c8, %c0], %0 {in_bounds = [true]} : memref<16x16xf32>, vector<8xf32> + %4 = vector.transfer_read %arg0[%c8, %c8], %0 {in_bounds = [true]} : memref<16x16xf32>, vector<8xf32> + %5 = vector.fma %1, %2, %arg1 : vector<8xf32> + %6 = vector.fma %3, %4, %5 : vector<8xf32> + return %6 : vector<8xf32> +} + +// CHECK-LABEL: @sink_vector_transfer_reads +// CHECK: vector.transfer_read +// CHECK-NEXT: vector.transfer_read +// CHECK-NEXT: vector.fma +// CHECK-NEXT: vector.transfer_read +// CHECK-NEXT: vector.transfer_read +// CHECK-NEXT: vector.fma + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %0 { + transform.apply_patterns.x86vector.sink_vector_producer_ops + } : !transform.any_op + transform.yield + } +} + +// ----- + +func.func @sink_vector_transfer_reads_tensor(%arg0: tensor<16x16xf32>, %arg1: vector<8xf32>) -> vector<8xf32> { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %0 = ub.poison : f32 + %1 = vector.transfer_read %arg0[%c0, %c0], %0 {in_bounds = [true]} : tensor<16x16xf32>, vector<8xf32> + %2 = vector.transfer_read %arg0[%c0, %c8], %0 {in_bounds = [true]} : tensor<16x16xf32>, vector<8xf32> + %3 = vector.transfer_read %arg0[%c8, %c0], %0 {in_bounds = [true]} : tensor<16x16xf32>, vector<8xf32> + %4 = vector.transfer_read %arg0[%c8, %c8], %0 {in_bounds = [true]} : tensor<16x16xf32>, vector<8xf32> + %5 = vector.fma %1, %2, %arg1 : vector<8xf32> + %6 = vector.fma %3, %4, %5 : vector<8xf32> + return %6 : vector<8xf32> +} + +// CHECK-LABEL: @sink_vector_transfer_reads_tensor +// CHECK: vector.transfer_read +// 
CHECK-NEXT: vector.transfer_read +// CHECK-NEXT: vector.fma +// CHECK-NEXT: vector.transfer_read +// CHECK-NEXT: vector.transfer_read +// CHECK-NEXT: vector.fma + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %0 { + transform.apply_patterns.x86vector.sink_vector_producer_ops + } : !transform.any_op + transform.yield + } +} + +// ----- + +func.func @negative_no_infinite_looping(%arg0: memref<16x16xf32>, %arg1: vector<8xf32>) -> vector<8xf32> { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %0 = vector.load %arg0[%c0, %c0] : memref<16x16xf32>, vector<8xf32> + %1 = vector.load %arg0[%c0, %c8] : memref<16x16xf32>, vector<8xf32> + %2 = vector.fma %0, %1, %arg1 : vector<8xf32> + return %2: vector<8xf32> +} + +// CHECK-LABEL: @negative_no_infinite_looping +// CHECK: vector.load +// CHECK-NEXT: vector.load +// CHECK-NEXT: vector.fma + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %0 { + transform.apply_patterns.x86vector.sink_vector_producer_ops + } : !transform.any_op + transform.yield + } +} + +// ----- + +func.func @negative_no_sink_outside_block(%arg0: memref<8x16xf32>, %arg1: i1) -> vector<8xf32> { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %0 = vector.load %arg0[%c0, %c0] : memref<8x16xf32>, vector<8xf32> + %1 = vector.load %arg0[%c0, %c8] : memref<8x16xf32>, vector<8xf32> + %2 = scf.if %arg1 -> (vector<8xf32>) { + scf.yield %0 : vector<8xf32> + } else { + scf.yield %1 : vector<8xf32> + } + return %2 : vector<8xf32> +} + +// CHECK-LABEL: 
@negative_no_sink_outside_block +// CHECK: vector.load +// CHECK-NEXT: vector.load +// CHECK-NEXT: scf.if +// CHECK-NEXT: scf.yield + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %0 { + transform.apply_patterns.x86vector.sink_vector_producer_ops + } : !transform.any_op + transform.yield + } +} From e3e18a74d387b8bb789eb0fb723586d5d308388f Mon Sep 17 00:00:00 2001 From: Arun Thangamani Date: Mon, 24 Nov 2025 07:34:18 -0800 Subject: [PATCH 2/4] added a bf16 test-case --- .../Transforms/SinkVectorProducerOps.cpp | 15 +++++-- .../X86Vector/sink-vector-producer-ops.mlir | 45 +++++++++++++++++++ 2 files changed, 56 insertions(+), 4 deletions(-) diff --git a/mlir/lib/Dialect/X86Vector/Transforms/SinkVectorProducerOps.cpp b/mlir/lib/Dialect/X86Vector/Transforms/SinkVectorProducerOps.cpp index 85cb18687a4fc..eb60e3e21d515 100644 --- a/mlir/lib/Dialect/X86Vector/Transforms/SinkVectorProducerOps.cpp +++ b/mlir/lib/Dialect/X86Vector/Transforms/SinkVectorProducerOps.cpp @@ -52,12 +52,19 @@ struct SinkVectorProducerOps final : public OpRewritePattern { for (OpResult result : nextOp->getResults()) for (Operation *user : result.getUsers()) nextOpUsers.push_back(user); - if (llvm::any_of(users, [&](Operation *x) { - return llvm::is_contained(nextOpUsers, x); - })) { - return failure(); + + Operation *nextFirstUser = nextOp->getNextNode(); + while (nextFirstUser) { + if (llvm::is_contained(nextOpUsers, nextFirstUser)) + break; + + nextFirstUser = nextFirstUser->getNextNode(); } + if (llvm::is_contained(users, nextFirstUser)) + return failure(); + + // Find the nearest user by scanning forward. 
while (nextOp) { if (llvm::is_contained(users, nextOp)) diff --git a/mlir/test/Dialect/X86Vector/sink-vector-producer-ops.mlir b/mlir/test/Dialect/X86Vector/sink-vector-producer-ops.mlir index 04045b05bda49..11af315e69e66 100644 --- a/mlir/test/Dialect/X86Vector/sink-vector-producer-ops.mlir +++ b/mlir/test/Dialect/X86Vector/sink-vector-producer-ops.mlir @@ -98,6 +98,50 @@ module attributes {transform.with_named_sequence} { // ----- +#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d2, d4, d1)> +#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d3, d1)> +#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d2, d3)> + +func.func @sink_vector_transfer_reads_bf16(%arg0: tensor<4x64x32x2xbf16>, %arg1: tensor<4x32x64x2xbf16>, %arg2: vector<1x16xf32>) -> vector<1x16xf32> { + %0 = ub.poison : bf16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %extracted_slice = tensor.extract_slice %arg0[%c0, %c0, %c0, 0] [1, 4, 1, 2] [1, 1, 1, 1] : tensor<4x64x32x2xbf16> to tensor<1x4x1x2xbf16> + %extracted_slice_0 = tensor.extract_slice %arg1[%c0, %c0, %c0, 0] [1, 1, 32, 2] [1, 1, 1, 1] : tensor<4x32x64x2xbf16> to tensor<1x1x32x2xbf16> + %1 = vector.transfer_read %extracted_slice[%c0, %c0, %c0, %c0], %0 {in_bounds = [true, true, true, true]} : tensor<1x4x1x2xbf16>, vector<1x1x1x2xbf16> + %2 = vector.transfer_read %extracted_slice[%c0, %c1, %c0, %c0], %0 {in_bounds = [true, true, true, true]} : tensor<1x4x1x2xbf16>, vector<1x1x1x2xbf16> + %3 = vector.transfer_read %extracted_slice_0[%c0, %c0, %c0, %c0], %0 {in_bounds = [true, true, true, true]} : tensor<1x1x32x2xbf16>, vector<1x1x16x2xbf16> + %4 = vector.transfer_read %extracted_slice_0[%c0, %c0, %c16, %c0], %0 {in_bounds = [true, true, true, true]} : tensor<1x1x32x2xbf16>, vector<1x1x16x2xbf16> + %5 = vector.contract {indexing_maps = [#map, #map1, #map2], iterator_types = ["reduction", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind} %1, %3, %arg2 {unroll_shape = 
array} : vector<1x1x1x2xbf16>, vector<1x1x16x2xbf16> into vector<1x16xf32> + %6 = vector.contract {indexing_maps = [#map, #map1, #map2], iterator_types = ["reduction", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind} %1, %4, %5 {unroll_shape = array} : vector<1x1x1x2xbf16>, vector<1x1x16x2xbf16> into vector<1x16xf32> + %7 = vector.contract {indexing_maps = [#map, #map1, #map2], iterator_types = ["reduction", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind} %2, %3, %6 {unroll_shape = array} : vector<1x1x1x2xbf16>, vector<1x1x16x2xbf16> into vector<1x16xf32> + %8 = vector.contract {indexing_maps = [#map, #map1, #map2], iterator_types = ["reduction", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind} %2, %4, %7 {unroll_shape = array} : vector<1x1x1x2xbf16>, vector<1x1x16x2xbf16> into vector<1x16xf32> + return %8 : vector<1x16xf32> +} + +// CHECK-LABEL: @sink_vector_transfer_reads_bf16 +// CHECK: vector.transfer_read +// CHECK-NEXT: vector.transfer_read +// CHECK-NEXT: vector.contract +// CHECK-NEXT: vector.transfer_read +// CHECK-NEXT: vector.contract +// CHECK-NEXT: vector.transfer_read +// CHECK-NEXT: vector.contract +// CHECK-NEXT: vector.contract + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %0 { + transform.apply_patterns.x86vector.sink_vector_producer_ops + } : !transform.any_op + transform.yield + } +} + +// ----- + func.func @negative_no_infinite_looping(%arg0: memref<16x16xf32>, %arg1: vector<8xf32>) -> vector<8xf32> { %c0 = arith.constant 0 : index %c8 = arith.constant 8 : index @@ -152,3 +196,4 @@ module attributes {transform.with_named_sequence} { transform.yield } } + From 0a1f4233c4363f5f5ee4928629e198a23e0a8e6a Mon Sep 17 00:00:00 2001 From: Arun 
Thangamani Date: Mon, 24 Nov 2025 07:38:11 -0800 Subject: [PATCH 3/4] fix clang-format errors --- mlir/lib/Dialect/X86Vector/Transforms/SinkVectorProducerOps.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/mlir/lib/Dialect/X86Vector/Transforms/SinkVectorProducerOps.cpp b/mlir/lib/Dialect/X86Vector/Transforms/SinkVectorProducerOps.cpp index eb60e3e21d515..b31636958e158 100644 --- a/mlir/lib/Dialect/X86Vector/Transforms/SinkVectorProducerOps.cpp +++ b/mlir/lib/Dialect/X86Vector/Transforms/SinkVectorProducerOps.cpp @@ -64,7 +64,6 @@ struct SinkVectorProducerOps final : public OpRewritePattern { if (llvm::is_contained(users, nextFirstUser)) return failure(); - // Find the nearest user by scanning forward. while (nextOp) { if (llvm::is_contained(users, nextOp)) From 2bd09d8ba1feedec339d94cb1e727058f209de3f Mon Sep 17 00:00:00 2001 From: Arun Thangamani Date: Mon, 24 Nov 2025 20:12:26 -0800 Subject: [PATCH 4/4] validate based on first user --- .../X86Vector/Transforms/SinkVectorProducerOps.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/mlir/lib/Dialect/X86Vector/Transforms/SinkVectorProducerOps.cpp b/mlir/lib/Dialect/X86Vector/Transforms/SinkVectorProducerOps.cpp index b31636958e158..95d970c404b70 100644 --- a/mlir/lib/Dialect/X86Vector/Transforms/SinkVectorProducerOps.cpp +++ b/mlir/lib/Dialect/X86Vector/Transforms/SinkVectorProducerOps.cpp @@ -61,9 +61,6 @@ struct SinkVectorProducerOps final : public OpRewritePattern { nextFirstUser = nextFirstUser->getNextNode(); } - if (llvm::is_contained(users, nextFirstUser)) - return failure(); - // Find the nearest user by scanning forward. while (nextOp) { if (llvm::is_contained(users, nextOp)) @@ -75,7 +72,12 @@ struct SinkVectorProducerOps final : public OpRewritePattern { if (!nextOp) return failure(); - // // Both ops must be in the same block to safely move. + // The Op first user and next Op first user are the same. Bail out here + // to avoid the shift cycle looping.
+ if (nextOp == nextFirstUser) + return failure(); + + // Both ops must be in the same block to safely move. if (op->getBlock() != nextOp->getBlock()) return failure();