[MLIR] Improve in-place folding to iterate until fixed-point #160615
Conversation
@llvm/pr-subscribers-llvm-adt @llvm/pr-subscribers-mlir @llvm/pr-subscribers-mlir-gpu

Author: Mehdi Amini (joker-eph)

Changes

When executed in the context of canonicalization, the folders are invoked in a fixed-point iterative process. However, in the context of an API like `createOrFold()` or in DialectConversion, we expect a "one-shot" call to fold to be as "folded" as possible. Even when folders themselves are idempotent, folders on a given operation interact with each other. For example:

// X = 0 + Y
%X = arith.addi %c_0, %Y : i32

should fold to %Y, but the process actually involves first the folder provided by the IsCommutative trait, which moves the constant to the right. This happens after attempting to fold the operation, and the operation folder isn't attempted again after the trait folder applies. This commit makes sure we iterate until a fixed point on folder applications.

Full diff: https://github.com/llvm/llvm-project/pull/160615.diff

6 Files Affected:
diff --git a/mlir/lib/IR/Builders.cpp b/mlir/lib/IR/Builders.cpp
index 3d366276b4375..6373b1ead94d1 100644
--- a/mlir/lib/IR/Builders.cpp
+++ b/mlir/lib/IR/Builders.cpp
@@ -14,6 +14,7 @@
#include "mlir/IR/IRMapping.h"
#include "mlir/IR/Matchers.h"
#include "llvm/ADT/SmallVectorExtras.h"
+#include "llvm/Support/DebugLog.h"
using namespace mlir;
@@ -489,6 +490,12 @@ OpBuilder::tryFold(Operation *op, SmallVectorImpl<Value> &results,
if (failed(op->fold(foldResults)))
return cleanupFailure();
+ int count = 0;
+ do {
+ LDBG() << "Folded in place #" << count++
+ << " times: " << OpWithFlags(op, OpPrintingFlags().skipRegions());
+ } while (foldResults.empty() && succeeded(op->fold(foldResults)));
+
// An in-place fold does not require generation of any constants.
if (foldResults.empty())
return success();
diff --git a/mlir/lib/Transforms/Utils/FoldUtils.cpp b/mlir/lib/Transforms/Utils/FoldUtils.cpp
index 5e07509871ea2..70310685315c4 100644
--- a/mlir/lib/Transforms/Utils/FoldUtils.cpp
+++ b/mlir/lib/Transforms/Utils/FoldUtils.cpp
@@ -16,6 +16,7 @@
#include "mlir/IR/Builders.h"
#include "mlir/IR/Matchers.h"
#include "mlir/IR/Operation.h"
+#include "llvm/Support/DebugLog.h"
using namespace mlir;
@@ -226,8 +227,15 @@ bool OperationFolder::isFolderOwnedConstant(Operation *op) const {
LogicalResult OperationFolder::tryToFold(Operation *op,
SmallVectorImpl<Value> &results) {
SmallVector<OpFoldResult, 8> foldResults;
- if (failed(op->fold(foldResults)) ||
- failed(processFoldResults(op, results, foldResults)))
+ if (failed(op->fold(foldResults)))
+ return failure();
+ int count = 1;
+ do {
+ LDBG() << "Folded in place #" << count++
+ << " times: " << OpWithFlags(op, OpPrintingFlags().skipRegions());
+ } while (foldResults.empty() && succeeded(op->fold(foldResults)));
+
+ if (failed(processFoldResults(op, results, foldResults)))
return failure();
return success();
}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir b/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir
index 547c7355e00c6..b73bc69393dab 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir
@@ -7,10 +7,8 @@ gpu.module @test {
//CHECK: [[IDY:%.+]] = affine.apply #map()[[[sgId]]]
//CHECK: [[c32:%.+]] = arith.constant 32 : index
//CHECK: [[LOCALY:%.+]] = index.mul [[IDY]], [[c32]]
- //CHECK: [[c0:%.+]] = arith.constant 0 : index
- //CHECK: [[Y:%.+]] = arith.addi [[LOCALY]], [[c0]] : index
//CHECK: [[c128:%.+]] = arith.constant 128 : index
- //CHECK: [[MODY:%.+]] = index.remu [[Y]], [[c128]]
+ //CHECK: [[MODY:%.+]] = index.remu [[LOCALY]], [[c128]]
//CHECK: [[BASE:%.+]] = vector.step : vector<32xindex>
//CHECK: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<32xindex>
//CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<32xindex>
@@ -23,10 +21,8 @@ gpu.module @test {
//CHECK: [[IDY:%.+]] = affine.apply #map()[[[sgId]]]
//CHECK: [[c32:%.+]] = arith.constant 32 : index
//CHECK: [[LOCALY:%.+]] = index.mul [[IDY]], [[c32]]
- //CHECK: [[c0:%.+]] = arith.constant 0 : index
- //CHECK: [[Y:%.+]] = arith.addi [[LOCALY]], [[c0]] : index
//CHECK: [[c128:%.+]] = arith.constant 128 : index
- //CHECK: [[MODY:%.+]] = index.remu [[Y]], [[c128]]
+ //CHECK: [[MODY:%.+]] = index.remu [[LOCALY]], [[c128]]
//CHECK: [[BASE:%.+]] = vector.step : vector<32xindex>
//CHECK: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<32xindex>
//CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<32xindex>
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
index e5cc65e6bd3d7..d2d250cbe0f66 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
@@ -27,12 +27,10 @@ gpu.module @test_round_robin_assignment {
//CHECK: [[LX:%.+]] = index.mul [[IdX]], [[C64]]
//CHECK: [[C0:%.+]] = arith.constant 0 : index
//CHECK: [[C0_1:%.+]] = arith.constant 0 : index
- //CHECK: [[ADDY:%.+]] = arith.addi [[LY]], [[C0]] : index
- //CHECK: [[ADDX:%.+]] = arith.addi [[LX]], [[C0_1]] : index
//CHECK: [[C128:%.+]] = arith.constant 128 : index
- //CHECK: [[offY:%.+]] = index.remu [[ADDY]], [[C128]]
+ //CHECK: [[offY:%.+]] = index.remu [[LY]], [[C128]]
//CHECK: [[C64_2:%.+]] = arith.constant 64 : index
- //CHECK: [[offX:%.+]] = index.remu [[ADDX]], [[C64_2]]
+ //CHECK: [[offX:%.+]] = index.remu [[LX]], [[C64_2]]
//CHECK: xegpu.create_nd_tdesc [[ARG_0]][[[offY]], [[offX]]] : memref<256x128xf32> -> !xegpu.tensor_desc<16x64xf32>
%tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
-> !xegpu.tensor_desc<128x64xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 64]>>
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
index 3478a9b91da5f..31d015d89a2dc 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
@@ -325,12 +325,10 @@ gpu.module @test_distribution {
//CHECK: [[l_off_x:%.+]] = index.mul [[id_x]], [[c32_1]]
//CHECK: [[c0:%.+]] = arith.constant 0 : index
//CHECK: [[c0_1:%.+]] = arith.constant 0 : index
- //CHECK: [[l_off_y_0:%.+]] = arith.addi [[l_off_y]], [[c0]] : index
- //CHECK: [[l_off_x_0:%.+]] = arith.addi [[l_off_x]], [[c0_1]] : index
//CHECK: [[c64:%.+]] = arith.constant 64 : index
- //CHECK: [[off_y:%.+]] = index.remu [[l_off_y_0]], [[c64]]
+ //CHECK: [[off_y:%.+]] = index.remu [[l_off_y]], [[c64]]
//CHECK: [[c128:%.+]] = arith.constant 128 : index
- //CHECK: [[off_x:%.+]] = index.remu [[l_off_x_0]], [[c128]]
+ //CHECK: [[off_x:%.+]] = index.remu [[l_off_x]], [[c128]]
//CHECK: xegpu.load_matrix [[mdesc]][[[off_y]], [[off_x]]] <{layout = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}>: !xegpu.mem_desc<64x128xf32>, index, index -> vector<32x32xf32>
%0 = xegpu.create_mem_desc %arg0 : memref<32768xi8, 3> -> !xegpu.mem_desc<64x128xf32>
%1 = xegpu.load_matrix %0[0, 0] <{layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [32, 32], lane_layout = [2, 8], lane_data = [1, 1]>}>: !xegpu.mem_desc<64x128xf32> -> vector<64x128xf32>
@@ -349,13 +347,11 @@ gpu.module @test_distribution {
//CHECK: [[id_y:%.+]] = affine.apply #map()[[[sgid]]]
//CHECK: [[id_x:%.+]] = affine.apply #map1()[[[sgid]]]
//CHECK: [[c32:%.+]] = arith.constant 32 : index
- //CHECK: [[l_off_y_0:%.+]] = index.mul [[id_y]], [[c32]]
+ //CHECK: [[l_off_y:%.+]] = index.mul [[id_y]], [[c32]]
//CHECK: [[c32_1:%.+]] = arith.constant 32 : index
- //CHECK: [[l_off_x_0:%.+]] = index.mul [[id_x]], [[c32_1]]
+ //CHECK: [[l_off_x:%.+]] = index.mul [[id_x]], [[c32_1]]
//CHECK: [[c0:%.+]] = arith.constant 0 : index
//CHECK: [[c0_2:%.+]] = arith.constant 0 : index
- //CHECK: [[l_off_y:%.+]] = arith.addi [[l_off_y_0]], [[c0]] : index
- //CHECK: [[l_off_x:%.+]] = arith.addi [[l_off_x_0]], [[c0_2]] : index
//CHECK: [[c64:%.+]] = arith.constant 64 : index
//CHECK: [[off_y:%.+]] = index.remu [[l_off_y]], [[c64]]
//CHECK: [[c128:%.+]] = arith.constant 128 : index
@@ -372,11 +368,10 @@ gpu.module @test_distribution {
//CHECK: [[sgId:%.+]] = gpu.subgroup_id : index
//CHECK-DAG: [[IDY:%.+]] = affine.apply #map2()[[[sgId]]]
//CHECK-DAG: [[c32:%.+]] = arith.constant 32 : index
- //CHECK-DAG: [[LOCALY:%.+]] = index.mul [[IDY]], [[c32]]
+ //CHECK-DAG: [[LY:%.+]] = index.mul [[IDY]], [[c32]]
//CHECK-DAG: [[c0:%.+]] = arith.constant 0 : index
- //CHECK-DAG: [[Y:%.+]] = arith.addi [[LOCALY]], [[c0]] : index
//CHECK-DAG: [[c128:%.+]] = arith.constant 128 : index
- //CHECK-DAG: [[MODY:%.+]] = index.remu [[Y]], [[c128]]
+ //CHECK-DAG: [[MODY:%.+]] = index.remu [[LY]], [[c128]]
//CHECK-DAG: [[BASE:%.+]] = vector.step : vector<32xindex>
//CHECK-DAG: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<32xindex>
//CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<32xindex>
@@ -390,9 +385,8 @@ gpu.module @test_distribution {
//CHECK-DAG: [[c8:%.+]] = arith.constant 8 : index
//CHECK-DAG: [[LOCALY:%.+]] = index.mul [[sgId]], [[c8]]
//CHECK-DAG: [[c0:%.+]] = arith.constant 0 : index
- //CHECK-DAG: [[Y:%.+]] = arith.addi [[LOCALY]], [[c0]] : index
//CHECK-DAG: [[c128:%.+]] = arith.constant 128 : index
- //CHECK-DAG: [[MODY:%.+]] = index.remu [[Y]], [[c128]]
+ //CHECK-DAG: [[MODY:%.+]] = index.remu [[LOCALY]], [[c128]]
//CHECK-DAG: [[BASE:%.+]] = vector.step : vector<8xindex>
//CHECK-DAG: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<8xindex>
//CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<8xindex>
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
index c0fb373835e3d..e83229e3a3995 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
@@ -14,12 +14,10 @@ gpu.module @test_1_1_assignment {
//CHECK: [[LX:%.+]] = index.mul [[SGIDX]], [[C32]]
//CHECK: [[C0:%.+]] = arith.constant 0 : index
//CHECK: [[C0_1:%.+]] = arith.constant 0 : index
- //CHECK: [[UY:%.+]] = arith.addi [[LY]], [[C0]] : index
- //CHECK: [[UX:%.+]] = arith.addi [[LX]], [[C0_1]] : index
//CHECK: [[C256:%.+]] = arith.constant 256 : index
- //CHECK: [[Y:%.+]] = index.remu [[UY]], [[C256]]
+ //CHECK: [[Y:%.+]] = index.remu [[LY]], [[C256]]
//CHECK: [[C128:%.+]] = arith.constant 128 : index
- //CHECK: [[X:%.+]] = index.remu [[UX]], [[C128]]
+ //CHECK: [[X:%.+]] = index.remu [[LX]], [[C128]]
//CHECK: [[TDESC:%.+]] = xegpu.create_nd_tdesc [[ARG_0]][[[Y]], [[X]]] : memref<256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
%tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32>
-> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
@@ -37,17 +35,13 @@ gpu.module @test_1_1_assignment {
//CHECK: [[LX:%.+]] = index.mul [[SGIDX]], [[C32]]
//CHECK: [[C0:%.+]] = arith.constant 0 : index
//CHECK: [[C0_2:%.+]] = arith.constant 0 : index
- //CHECK: [[UY:%.+]] = arith.addi [[LY]], [[C0]] : index
- //CHECK: [[UX:%.+]] = arith.addi [[LX]], [[C0_2]] : index
//CHECK: [[C256:%.+]] = arith.constant 256 : index
- //CHECK: [[MODY:%.+]] = index.remu [[UY]], [[C256]]
+ //CHECK: [[MODY:%.+]] = index.remu [[LY]], [[C256]]
//CHECK: [[C128:%.+]] = arith.constant 128 : index
- //CHECK: [[MODX:%.+]] = index.remu [[UX]], [[C128]]
+ //CHECK: [[MODX:%.+]] = index.remu [[LX]], [[C128]]
//CHECK: [[C0_3:%.+]] = arith.constant 0 : index
- //CHECK: [[Y:%.+]] = index.add [[MODY]], [[C0_3]]
//CHECK: [[C0_4:%.+]] = arith.constant 0 : index
- //CHECK: [[X:%.+]] = index.add [[MODX]], [[C0_4]]
- //CHECK: [[TDESC:%.+]] = xegpu.create_nd_tdesc [[ARG_0]][1, [[Y]], [[X]]] : memref<3x256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+ //CHECK: [[TDESC:%.+]] = xegpu.create_nd_tdesc [[ARG_0]][1, [[MODY]], [[MODX]]] : memref<3x256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
%tdesc = xegpu.create_nd_tdesc %src[1, 0, 0] : memref<3x256x128xf32>
-> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>>
gpu.return
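The heart of both C++ hunks above is the same fixed-point loop. Isolated as a standalone sketch (with a hypothetical `foldToFixedPoint` name, and the `LDBG` logging dropped for brevity):

```cpp
#include "mlir/IR/OpDefinition.h"
#include "mlir/IR/Operation.h"
#include "llvm/ADT/SmallVector.h"

using namespace mlir;

// Re-run the folder as long as it keeps succeeding "in place", i.e. it
// returns success() but leaves `results` empty. The loop stops as soon
// as a fold produces replacement values or the folder reports failure.
static LogicalResult foldToFixedPoint(Operation *op,
                                      SmallVectorImpl<OpFoldResult> &results) {
  if (failed(op->fold(results)))
    return failure(); // Nothing folded at all.
  while (results.empty() && succeeded(op->fold(results))) {
    // Each successful iteration updated the op in place (e.g. a trait
    // folder reordered operands); try again in case another folder now
    // applies.
  }
  return success();
}
```

One caveat, which comes up later in this thread: a folder that keeps reporting in-place "success" without converging will make this loop spin forever.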
This makes sense to me. Do you have some statistics on the usual number of iterations, and whether this adds measurable overhead?
This necessarily adds some overhead; I don't have stats though (it's going to be heavily dependent on the setup: how the folders are structured and what the input IR looks like). For the number of iterations: it should be 1–2 in general, but that assumes folders are well written (which we want anyway, because folding is already iterative in canonicalization).
I can cherry-pick this and check by compiling something like llama/SDXL in IREE. I should be able to find time for this in the next couple of days.
FYI I added assertions for …
Actually it was vector.extract, but in the context of legalizing to spirv: …

Seems like it first folds a dynamic extract to a static one, then it folds again by looking through the operand somehow?
This change makes sense to me.
The same strategy could potentially be used in the greedy pattern rewrite driver to improve efficiency: we could try to fold the op in a loop until no more folding is possible. This may be more efficient than pushing/popping the op onto the worklist. (We still have to put the op on the worklist after the last successful folding, so it's unclear if this actually improves compilation time.)
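A rough sketch of that idea, with hypothetical `replaceWith` and `reenqueue` callbacks standing in for the real driver machinery (this is not the actual GreedyPatternRewriteDriver code):

```cpp
#include "mlir/IR/OpDefinition.h"
#include "mlir/IR/Operation.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLFunctionalExtras.h"
#include "llvm/ADT/SmallVector.h"

using namespace mlir;

// Hypothetical driver step: fold `op` to a fixed point immediately
// instead of re-enqueuing it after every single in-place fold.
static void foldInDriver(
    Operation *op, llvm::function_ref<void(Operation *)> reenqueue,
    llvm::function_ref<void(Operation *, ArrayRef<OpFoldResult>)> replaceWith) {
  SmallVector<OpFoldResult> results;
  bool foldedInPlace = false;
  while (succeeded(op->fold(results)) && results.empty())
    foldedInPlace = true; // in-place fold; another folder may now fire
  if (!results.empty())
    replaceWith(op, results); // materialize constants and replace uses
  else if (foldedInPlace)
    reenqueue(op); // op changed in place; other patterns may apply again
}
```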
Nice! I always just assumed this happened for some reason.
Nice!
I'm curious, how could we verify now that a particular folder fully applies in one iteration? As you know, we previously had ordering issues in the …
@dcaballe: I added a `maxIterations` to the API, and an option to the test pass.
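For reference, a bounded version of the fixed-point loop might look like this (hypothetical sketch; the actual parameter plumbing in the patch may differ):

```cpp
#include "mlir/IR/OpDefinition.h"
#include "mlir/IR/Operation.h"
#include "llvm/ADT/SmallVector.h"

using namespace mlir;

// Fold to a fixed point, but give up after `maxIterations` in-place
// folds so that a non-converging folder cannot spin forever.
static LogicalResult foldBounded(Operation *op,
                                 SmallVectorImpl<OpFoldResult> &results,
                                 int maxIterations) {
  if (failed(op->fold(results)))
    return failure();
  int iteration = 1;
  while (results.empty() && iteration++ < maxIterations &&
         succeeded(op->fold(results))) {
  }
  return success();
}
```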
@kuhar just a reminder on this; it isn't urgent, but I wouldn't want this to get forgotten either. Thanks!
Cherry-picked this and tested on SDXL in IREE. Some stats: …

So the average number of iterations is roughly 1. No calls to the one in …. No meaningful increase in total compilation time 😄.
…lvm#160615)" This reverts commit fcf79e5.
Carrying revert for 9690a718b. Adds revert for llvm/llvm-project#160615 due to infinite loop error Skips generating test which segfaults during test generation on msvc. --------- Signed-off-by: Nirvedh Meshram <nirvedh@gmail.com> Signed-off-by: Erick Ochoa <erick.ochoalopez@amd.com> Co-authored-by: Nirvedh Meshram <nirvedh@gmail.com>
…lvm#160615)" This reverts commit fcf79e5.
@joker-eph I've encountered some problems when building IREE with this commit, but it is an interesting error. I have a relatively large program which undergoes the iree-convert-to-spirv pass. Most of the dialects in this program are upstream MLIR dialects, with the exception of the HAL dialect. I have already run this through iree-reduce (although the program is still large). When I run the pass with the input in question, invalid IR is printed to stderr without any error messages. The invalid IR is long and contains the following errors: …

However, the program succeeds and produces valid IR at the end. I'm trying to narrow down the problem a bit more, but hoping to get a discussion going. I've been following the discussion on #161145, and applying that as a patch does not fix the issue.
Operation *parent = op->getParentOp();
while (parent && parent->getName().getStringRef() != "spirv.func")
  parent = parent->getParentOp();
if (parent)
  parent->dump();
}
Should this be removed?
Uh... clearly!! (give me a min)
Fixed in e84dcba; thanks for the report!!
This was incorrectly left and merged with #160615
…lvm#160615)" This reverts commit fcf79e5.
…0615) When executed in the context of canonicalization, the folders are invoked in a fixed-point iterative process. However in the context of an API like `createOrFold()` or in DialectConversion for example, we expect a "one-shot" call to fold to be as "folded" as possible. However, even when folders themselves are indempotent, folders on a given operation interact with each other. For example: ``` // X = 0 + Y %X = arith.addi %c_0, %Y : i32 ``` should fold to %Y, but the process actually involves first the folder provided by the IsCommutative trait to move the constant to the right. However this happens after attempting to fold the operation and the operation folder isn't attempt again after applying the trait folder. This commit makes sure we iterate until fixed point on folder applications. Fixes llvm#159844
This was incorrectly left and merged with llvm#160615
…lvm#160615)" This reverts commit fcf79e5.
…lvm#160615)" This reverts commit fcf79e5.
…lvm#160615)" This reverts commit fcf79e5.
When executed in the context of canonicalization, the folders are invoked in a fixed-point iterative process. However, in the context of an API like `createOrFold()` or in DialectConversion, we expect a "one-shot" call to fold to be as "folded" as possible. Even when folders themselves are idempotent, folders on a given operation interact with each other. For example:

// X = 0 + Y
%X = arith.addi %c_0, %Y : i32

should fold to %Y, but the process actually involves first the folder provided by the IsCommutative trait, which moves the constant to the right. This happens after attempting to fold the operation, and the operation folder isn't attempted again after the trait folder applies.

This commit makes sure we iterate until a fixed point on folder applications.

Fixes #159844
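To make the folder interaction concrete, here is roughly how the example above plays out (a sketch of the intermediate state; `%c_0` is the zero constant from the snippet):

```mlir
// Initial IR: the constant is the *left* operand, so the arith.addi
// folder (which matches "x + 0", constant on the right) does not fire
// on the first fold() attempt.
%X = arith.addi %c_0, %Y : i32

// After the Commutative trait folder runs (an in-place fold that moves
// the constant to the right):
%X = arith.addi %Y, %c_0 : i32

// A second fold() invocation now matches "x + 0" and replaces all uses
// of %X with %Y. Before this patch, that second invocation only
// happened inside the canonicalizer's own fixed-point iteration.
```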