diff --git a/mlir/include/mlir/Transforms/FoldUtils.h b/mlir/include/mlir/Transforms/FoldUtils.h
index 2e7a6fe3e362c..ee89e8d0e7c3f 100644
--- a/mlir/include/mlir/Transforms/FoldUtils.h
+++ b/mlir/include/mlir/Transforms/FoldUtils.h
@@ -40,7 +40,10 @@ class OperationFolder {
   /// deduplicated constants. If successful, replaces `op`'s uses with
   /// folded results, and returns success. If the op was completely folded it is
   /// erased. If it is just updated in place, `inPlaceUpdate` is set to true.
-  LogicalResult tryToFold(Operation *op, bool *inPlaceUpdate = nullptr);
+  /// If the fold is an in-place update, the folder is re-invoked until it no
+  /// longer applies or `maxIterations` is reached (defaults to INT_MAX).
+  LogicalResult tryToFold(Operation *op, bool *inPlaceUpdate = nullptr,
+                          int maxIterations = INT_MAX);
 
   /// Tries to fold a pre-existing constant operation. `constValue` represents
   /// the value of the constant, and can be optionally passed if the value is
@@ -82,7 +85,10 @@ class OperationFolder {
 
   /// Tries to perform folding on the given `op`. If successful, populates
   /// `results` with the results of the folding.
-  LogicalResult tryToFold(Operation *op, SmallVectorImpl<Value> &results);
+  /// If the fold is an in-place update, the folder is re-invoked until it no
+  /// longer applies or `maxIterations` is reached (defaults to INT_MAX).
+  LogicalResult tryToFold(Operation *op, SmallVectorImpl<Value> &results,
+                          int maxIterations = INT_MAX);
 
   /// Try to process a set of fold results. Populates `results` on success,
   /// otherwise leaves it unchanged.
diff --git a/mlir/lib/IR/Builders.cpp b/mlir/lib/IR/Builders.cpp
index 3d366276b4375..c84e760a3f363 100644
--- a/mlir/lib/IR/Builders.cpp
+++ b/mlir/lib/IR/Builders.cpp
@@ -14,6 +14,7 @@
 #include "mlir/IR/IRMapping.h"
 #include "mlir/IR/Matchers.h"
 #include "llvm/ADT/SmallVectorExtras.h"
+#include "llvm/Support/DebugLog.h"
 
 using namespace mlir;
 
@@ -486,9 +487,25 @@ OpBuilder::tryFold(Operation *op, SmallVectorImpl<Value> &results,
 
   // Try to fold the operation.
   SmallVector<OpFoldResult> foldResults;
+  LDBG() << "Trying to fold: "
+         << OpWithFlags(op, OpPrintingFlags().skipRegions());
+  if (op->getName().getStringRef() == "vector.extract") {
+    Operation *parent = op->getParentOp();
+    while (parent && parent->getName().getStringRef() != "spirv.func")
+      parent = parent->getParentOp();
+    if (parent)
+      parent->dump();
+  }
   if (failed(op->fold(foldResults)))
     return cleanupFailure();
 
+  int count = 0;
+  do {
+    LDBG() << "Folded in place #" << count
+           << " times: " << OpWithFlags(op, OpPrintingFlags().skipRegions());
+    count++;
+  } while (foldResults.empty() && succeeded(op->fold(foldResults)));
+
   // An in-place fold does not require generation of any constants.
   if (foldResults.empty())
     return success();
diff --git a/mlir/lib/Transforms/Utils/FoldUtils.cpp b/mlir/lib/Transforms/Utils/FoldUtils.cpp
index 5e07509871ea2..68ad3acf295c8 100644
--- a/mlir/lib/Transforms/Utils/FoldUtils.cpp
+++ b/mlir/lib/Transforms/Utils/FoldUtils.cpp
@@ -16,6 +16,7 @@
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/Matchers.h"
 #include "mlir/IR/Operation.h"
+#include "llvm/Support/DebugLog.h"
 
 using namespace mlir;
 
@@ -67,7 +68,8 @@ static Operation *materializeConstant(Dialect *dialect, OpBuilder &builder,
 // OperationFolder
 //===----------------------------------------------------------------------===//
 
-LogicalResult OperationFolder::tryToFold(Operation *op, bool *inPlaceUpdate) {
+LogicalResult OperationFolder::tryToFold(Operation *op, bool *inPlaceUpdate,
+                                         int maxIterations) {
   if (inPlaceUpdate)
     *inPlaceUpdate = false;
 
@@ -86,7 +88,7 @@ LogicalResult OperationFolder::tryToFold(Operation *op, bool *inPlaceUpdate) {
 
   // Try to fold the operation.
   SmallVector<Value, 8> results;
-  if (failed(tryToFold(op, results)))
+  if (failed(tryToFold(op, results, maxIterations)))
     return failure();
 
   // Check to see if the operation was just updated in place.
@@ -224,10 +226,19 @@ bool OperationFolder::isFolderOwnedConstant(Operation *op) const {
 /// Tries to perform folding on the given `op`. If successful, populates
 /// `results` with the results of the folding.
 LogicalResult OperationFolder::tryToFold(Operation *op,
-                                         SmallVectorImpl<Value> &results) {
+                                         SmallVectorImpl<Value> &results,
+                                         int maxIterations) {
   SmallVector<OpFoldResult> foldResults;
-  if (failed(op->fold(foldResults)) ||
-      failed(processFoldResults(op, results, foldResults)))
+  if (failed(op->fold(foldResults)))
+    return failure();
+  int count = 1;
+  do {
+    LDBG() << "Folded in place #" << count
+           << " times: " << OpWithFlags(op, OpPrintingFlags().skipRegions());
+  } while (count++ < maxIterations && foldResults.empty() &&
+           succeeded(op->fold(foldResults)));
+
+  if (failed(processFoldResults(op, results, foldResults)))
     return failure();
   return success();
 }
diff --git a/mlir/test/Dialect/Arith/constant-fold.mlir b/mlir/test/Dialect/Arith/constant-fold.mlir
new file mode 100644
index 0000000000000..172945fafdaf3
--- /dev/null
+++ b/mlir/test/Dialect/Arith/constant-fold.mlir
@@ -0,0 +1,18 @@
+// Test with the default (one application of the folder) and then with 2 iterations.
+// RUN: mlir-opt %s --pass-pipeline="builtin.module(func.func(test-single-fold))" | FileCheck %s --check-prefixes=CHECK,CHECK-ONE
+// RUN: mlir-opt %s --pass-pipeline="builtin.module(func.func(test-single-fold{max-iterations=2}))" | FileCheck %s --check-prefixes=CHECK,CHECK-TWO
+
+
+// Folding this entirely requires moving the constant to the right
+// before invoking the op-specific folder.
+// With one iteration, we just push the constant to the right.
+// With a second iteration, we actually fold the "add" (x+0 -> x).
+// CHECK: func @recurse_fold_traits(%[[ARG0:.*]]: i32)
+func.func @recurse_fold_traits(%arg0 : i32) -> i32 {
+  %cst0 = arith.constant 0 : i32
+// CHECK-ONE: %[[ADD:.*]] = arith.addi %[[ARG0]],
+  %res = arith.addi %cst0, %arg0 : i32
+// CHECK-ONE: return %[[ADD]] : i32
+// CHECK-TWO: return %[[ARG0]] : i32
+  return %res : i32
+}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir b/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir
index 547c7355e00c6..b73bc69393dab 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir
@@ -7,10 +7,8 @@ gpu.module @test {
     //CHECK: [[IDY:%.+]] = affine.apply #map()[[[sgId]]]
     //CHECK: [[c32:%.+]] = arith.constant 32 : index
     //CHECK: [[LOCALY:%.+]] = index.mul [[IDY]], [[c32]]
-    //CHECK: [[c0:%.+]] = arith.constant 0 : index
-    //CHECK: [[Y:%.+]] = arith.addi [[LOCALY]], [[c0]] : index
     //CHECK: [[c128:%.+]] = arith.constant 128 : index
-    //CHECK: [[MODY:%.+]] = index.remu [[Y]], [[c128]]
+    //CHECK: [[MODY:%.+]] = index.remu [[LOCALY]], [[c128]]
     //CHECK: [[BASE:%.+]] = vector.step : vector<32xindex>
     //CHECK: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<32xindex>
     //CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<32xindex>
@@ -23,10 +21,8 @@ gpu.module @test {
     //CHECK: [[IDY:%.+]] = affine.apply #map()[[[sgId]]]
     //CHECK: [[c32:%.+]] = arith.constant 32 : index
     //CHECK: [[LOCALY:%.+]] = index.mul [[IDY]], [[c32]]
-    //CHECK: [[c0:%.+]] = arith.constant 0 : index
-    //CHECK: [[Y:%.+]] = arith.addi [[LOCALY]], [[c0]] : index
     //CHECK: [[c128:%.+]] = arith.constant 128 : index
-    //CHECK: [[MODY:%.+]] = index.remu [[Y]], [[c128]]
+    //CHECK: [[MODY:%.+]] = index.remu [[LOCALY]], [[c128]]
     //CHECK: [[BASE:%.+]] = vector.step : vector<32xindex>
     //CHECK: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<32xindex>
     //CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<32xindex>
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
index e5cc65e6bd3d7..d2d250cbe0f66 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
@@ -27,12 +27,10 @@ gpu.module @test_round_robin_assignment {
    //CHECK: [[LX:%.+]] = index.mul [[IdX]], [[C64]]
    //CHECK: [[C0:%.+]] = arith.constant 0 : index
    //CHECK: [[C0_1:%.+]] = arith.constant 0 : index
-   //CHECK: [[ADDY:%.+]] = arith.addi [[LY]], [[C0]] : index
-   //CHECK: [[ADDX:%.+]] = arith.addi [[LX]], [[C0_1]] : index
    //CHECK: [[C128:%.+]] = arith.constant 128 : index
-   //CHECK: [[offY:%.+]] = index.remu [[ADDY]], [[C128]]
+   //CHECK: [[offY:%.+]] = index.remu [[LY]], [[C128]]
    //CHECK: [[C64_2:%.+]] = arith.constant 64 : index
-   //CHECK: [[offX:%.+]] = index.remu [[ADDX]], [[C64_2]]
+   //CHECK: [[offX:%.+]] = index.remu [[LX]], [[C64_2]]
    //CHECK: xegpu.create_nd_tdesc [[ARG_0]][[[offY]], [[offX]]] : memref<256x128xf32> -> !xegpu.tensor_desc<16x64xf32>
    %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
      -> !xegpu.tensor_desc<128x64xf32, #xegpu.layout>
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
index 48fc633974e63..03c63861705d9 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
@@ -325,12 +325,10 @@ gpu.module @test_distribution {
    //CHECK: [[l_off_x:%.+]] = index.mul [[id_x]], [[c32_1]]
    //CHECK: [[c0:%.+]] = arith.constant 0 : index
    //CHECK: [[c0_1:%.+]] = arith.constant 0 : index
-   //CHECK: [[l_off_y_0:%.+]] = arith.addi [[l_off_y]], [[c0]] : index
-   //CHECK: [[l_off_x_0:%.+]] = arith.addi [[l_off_x]], [[c0_1]] : index
    //CHECK: [[c64:%.+]] = arith.constant 64 : index
-   //CHECK: [[off_y:%.+]] = index.remu [[l_off_y_0]], [[c64]]
+   //CHECK: [[off_y:%.+]] = index.remu [[l_off_y]], [[c64]]
    //CHECK: [[c128:%.+]] = arith.constant 128 : index
-   //CHECK: [[off_x:%.+]] = index.remu [[l_off_x_0]], [[c128]]
+   //CHECK: [[off_x:%.+]] = index.remu [[l_off_x]], [[c128]]
    //CHECK: xegpu.load_matrix [[mdesc]][[[off_y]], [[off_x]]] <{layout = #xegpu.layout}>: !xegpu.mem_desc<64x128xf32>, index, index -> vector<32x32xf32>
    %0 = xegpu.create_mem_desc %arg0 : memref<32768xi8, 3> -> !xegpu.mem_desc<64x128xf32>
    %1 = xegpu.load_matrix %0[0, 0] <{layout = #xegpu.layout}>: !xegpu.mem_desc<64x128xf32> -> vector<64x128xf32>
@@ -349,13 +347,11 @@ gpu.module @test_distribution {
    //CHECK: [[id_y:%.+]] = affine.apply #map()[[[sgid]]]
    //CHECK: [[id_x:%.+]] = affine.apply #map1()[[[sgid]]]
    //CHECK: [[c32:%.+]] = arith.constant 32 : index
-   //CHECK: [[l_off_y_0:%.+]] = index.mul [[id_y]], [[c32]]
+   //CHECK: [[l_off_y:%.+]] = index.mul [[id_y]], [[c32]]
    //CHECK: [[c32_1:%.+]] = arith.constant 32 : index
-   //CHECK: [[l_off_x_0:%.+]] = index.mul [[id_x]], [[c32_1]]
+   //CHECK: [[l_off_x:%.+]] = index.mul [[id_x]], [[c32_1]]
    //CHECK: [[c0:%.+]] = arith.constant 0 : index
    //CHECK: [[c0_2:%.+]] = arith.constant 0 : index
-   //CHECK: [[l_off_y:%.+]] = arith.addi [[l_off_y_0]], [[c0]] : index
-   //CHECK: [[l_off_x:%.+]] = arith.addi [[l_off_x_0]], [[c0_2]] : index
    //CHECK: [[c64:%.+]] = arith.constant 64 : index
    //CHECK: [[off_y:%.+]] = index.remu [[l_off_y]], [[c64]]
    //CHECK: [[c128:%.+]] = arith.constant 128 : index
@@ -412,11 +408,10 @@ gpu.module @test_distribution {
    //CHECK: [[sgId:%.+]] = gpu.subgroup_id : index
    //CHECK-DAG: [[IDY:%.+]] = affine.apply #map2()[[[sgId]]]
    //CHECK-DAG: [[c32:%.+]] = arith.constant 32 : index
-   //CHECK-DAG: [[LOCALY:%.+]] = index.mul [[IDY]], [[c32]]
+   //CHECK-DAG: [[LY:%.+]] = index.mul [[IDY]], [[c32]]
    //CHECK-DAG: [[c0:%.+]] = arith.constant 0 : index
-   //CHECK-DAG: [[Y:%.+]] = arith.addi [[LOCALY]], [[c0]] : index
    //CHECK-DAG: [[c128:%.+]] = arith.constant 128 : index
-   //CHECK-DAG: [[MODY:%.+]] = index.remu [[Y]], [[c128]]
+   //CHECK-DAG: [[MODY:%.+]] = index.remu [[LY]], [[c128]]
    //CHECK-DAG: [[BASE:%.+]] = vector.step : vector<32xindex>
    //CHECK-DAG: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<32xindex>
    //CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<32xindex>
@@ -430,9 +425,8 @@ gpu.module @test_distribution {
    //CHECK-DAG: [[c8:%.+]] = arith.constant 8 : index
    //CHECK-DAG: [[LOCALY:%.+]] = index.mul [[sgId]], [[c8]]
    //CHECK-DAG: [[c0:%.+]] = arith.constant 0 : index
-   //CHECK-DAG: [[Y:%.+]] = arith.addi [[LOCALY]], [[c0]] : index
    //CHECK-DAG: [[c128:%.+]] = arith.constant 128 : index
-   //CHECK-DAG: [[MODY:%.+]] = index.remu [[Y]], [[c128]]
+   //CHECK-DAG: [[MODY:%.+]] = index.remu [[LOCALY]], [[c128]]
    //CHECK-DAG: [[BASE:%.+]] = vector.step : vector<8xindex>
    //CHECK-DAG: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<8xindex>
    //CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<8xindex>
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
index c0fb373835e3d..e83229e3a3995 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
@@ -14,12 +14,10 @@ gpu.module @test_1_1_assignment {
    //CHECK: [[LX:%.+]] = index.mul [[SGIDX]], [[C32]]
    //CHECK: [[C0:%.+]] = arith.constant 0 : index
    //CHECK: [[C0_1:%.+]] = arith.constant 0 : index
-   //CHECK: [[UY:%.+]] = arith.addi [[LY]], [[C0]] : index
-   //CHECK: [[UX:%.+]] = arith.addi [[LX]], [[C0_1]] : index
    //CHECK: [[C256:%.+]] = arith.constant 256 : index
-   //CHECK: [[Y:%.+]] = index.remu [[UY]], [[C256]]
+   //CHECK: [[Y:%.+]] = index.remu [[LY]], [[C256]]
    //CHECK: [[C128:%.+]] = arith.constant 128 : index
-   //CHECK: [[X:%.+]] = index.remu [[UX]], [[C128]]
+   //CHECK: [[X:%.+]] = index.remu [[LX]], [[C128]]
    //CHECK: [[TDESC:%.+]] = xegpu.create_nd_tdesc [[ARG_0]][[[Y]], [[X]]] : memref<256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout>
    %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
      -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout>
@@ -37,17 +35,13 @@ gpu.module @test_1_1_assignment {
    //CHECK: [[LX:%.+]] = index.mul [[SGIDX]], [[C32]]
    //CHECK: [[C0:%.+]] = arith.constant 0 : index
    //CHECK: [[C0_2:%.+]] = arith.constant 0 : index
-   //CHECK: [[UY:%.+]] = arith.addi [[LY]], [[C0]] : index
-   //CHECK: [[UX:%.+]] = arith.addi [[LX]], [[C0_2]] : index
    //CHECK: [[C256:%.+]] = arith.constant 256 : index
-   //CHECK: [[MODY:%.+]] = index.remu [[UY]], [[C256]]
+   //CHECK: [[MODY:%.+]] = index.remu [[LY]], [[C256]]
    //CHECK: [[C128:%.+]] = arith.constant 128 : index
-   //CHECK: [[MODX:%.+]] = index.remu [[UX]], [[C128]]
+   //CHECK: [[MODX:%.+]] = index.remu [[LX]], [[C128]]
    //CHECK: [[C0_3:%.+]] = arith.constant 0 : index
-   //CHECK: [[Y:%.+]] = index.add [[MODY]], [[C0_3]]
    //CHECK: [[C0_4:%.+]] = arith.constant 0 : index
-   //CHECK: [[X:%.+]] = index.add [[MODX]], [[C0_4]]
-   //CHECK: [[TDESC:%.+]] = xegpu.create_nd_tdesc [[ARG_0]][1, [[Y]], [[X]]] : memref<3x256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout>
+   //CHECK: [[TDESC:%.+]] = xegpu.create_nd_tdesc [[ARG_0]][1, [[MODY]], [[MODX]]] : memref<3x256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout>
    %tdesc = xegpu.create_nd_tdesc %src[1, 0, 0] : memref<3x256x128xf32>
      -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout>
    gpu.return
diff --git a/mlir/test/lib/Transforms/TestSingleFold.cpp b/mlir/test/lib/Transforms/TestSingleFold.cpp
index 5bd9dd2a1f075..e55f36aea0a7c 100644
--- a/mlir/test/lib/Transforms/TestSingleFold.cpp
+++ b/mlir/test/lib/Transforms/TestSingleFold.cpp
@@ -26,6 +26,9 @@ struct TestSingleFold : public PassWrapper<TestSingleFold, OperationPass<>>,
                         public RewriterBase::Listener {
   MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestSingleFold)
 
+  TestSingleFold() = default;
+  TestSingleFold(const TestSingleFold &pass) : PassWrapper(pass) {}
+
   StringRef getArgument() const final { return "test-single-fold"; }
   StringRef getDescription() const final {
     return "Test single-pass operation folding and dead constant elimination";
@@ -45,13 +48,18 @@ struct TestSingleFold : public PassWrapper<TestSingleFold, OperationPass<>>,
     if (it != existingConstants.end())
       existingConstants.erase(it);
   }
+
+  Option<int> maxIterations{*this, "max-iterations",
+                            llvm::cl::desc("Max iterations in tryToFold"),
+                            llvm::cl::init(1)};
 };
 } // namespace
 
 void TestSingleFold::foldOperation(Operation *op, OperationFolder &helper) {
   // Attempt to fold the specified operation, including handling unused or
   // duplicated constants.
-  (void)helper.tryToFold(op);
+  bool inPlaceUpdate = false;
+  (void)helper.tryToFold(op, &inPlaceUpdate, maxIterations);
 }
 
 void TestSingleFold::runOnOperation() {