diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp index fb5d1e758dbd1..7ab2e612ed890 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp @@ -8,7 +8,6 @@ #include "mlir/Dialect/Affine/Utils.h" #include "mlir/Dialect/Arith/Utils/Utils.h" -#include "mlir/Dialect/Index/IR/IndexOps.h" #include "mlir/Dialect/Utils/IndexingUtils.h" #include "mlir/Dialect/XeGPU/IR/XeGPU.h" #include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h" @@ -61,7 +60,7 @@ genCoordinates(OpBuilder &builder, Location loc, // Get the offset of `subShape` within a distribution unit. SmallVector distUnitLocalOffset = llvm::map_to_vector( llvm::zip(delinearizedId, subShape), [&](const auto &t) -> Value { - return builder.createOrFold( + return builder.createOrFold( loc, std::get<0>(t), builder.createOrFold(loc, std::get<1>(t))); }); @@ -84,7 +83,7 @@ genCoordinates(OpBuilder &builder, Location loc, // Do not go beyond `srcShape` bounds. SmallVector mods = llvm::map_to_vector( llvm::zip_equal(adds, srcShape), [&](const auto &t) -> Value { - return builder.createOrFold( + return builder.createOrFold( loc, std::get<0>(t), arith::ConstantIndexOp::create(builder, loc, std::get<1>(t))); }); @@ -343,7 +342,7 @@ LayoutAttr::delinearizeId(OpBuilder &builder, Location loc, Value linearId) { /// e.g., linearId=22, dimSize=4: 22 % 4 = 2 (we're at position 2 within /// this dimension) result[dimIdx] = - builder.createOrFold(loc, remaining, dimSizeVal); + builder.createOrFold(loc, remaining, dimSizeVal); /// Update remaining for the next dimension by removing what we've already /// processed. Division tells us "how many complete groups of this dimension @@ -352,7 +351,7 @@ LayoutAttr::delinearizeId(OpBuilder &builder, Location loc, Value linearId) { /// no next dimension to process if (i < order.size() - 1) { remaining = - builder.createOrFold(loc, remaining, dimSizeVal); + builder.createOrFold(loc, remaining, dimSizeVal); } } return result; diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp index 91432b1c11304..9f126fe8c2415 100644 --- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp +++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp @@ -12,7 +12,6 @@ #include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" -#include "mlir/Dialect/Index/IR/IndexOps.h" #include "mlir/Dialect/LLVMIR/XeVMDialect.h" #include "mlir/Dialect/SCF/Transforms/Patterns.h" #include "mlir/Dialect/Utils/IndexingUtils.h" @@ -527,7 +526,7 @@ SmallVector xegpu::addElementwise(OpBuilder &builder, for (auto [l, r] : llvm::zip_equal(lhs, rhs)) { auto lval = getValueOrCreateConstantIndexOp(builder, loc, l); auto rval = getValueOrCreateConstantIndexOp(builder, loc, r); - results.push_back(builder.createOrFold(loc, lval, rval)); + results.push_back(builder.createOrFold(loc, lval, rval)); } return results; } diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir index 8fd3cca5594cb..22177f8f6a15f 100644 --- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir +++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir @@ -271,11 +271,11 @@ gpu.module @xevm_module{ // CHECK: %[[C2:.*]] = arith.constant 2 : index // CHECK: %[[C8:.*]] = arith.constant 8 : index // CHECK: %[[LANE_ID:.*]] = gpu.lane_id -// CHECK: %[[REMU1:.*]] = index.remu %[[LANE_ID]], %[[C8]] -// CHECK: %[[DIVU:.*]] = index.divu %[[LANE_ID]], %[[C8]] -// CHECK: %[[REMU2:.*]] = index.remu %[[DIVU]], %[[C2]] -// CHECK: %[[REMU3:.*]] = index.remu %[[REMU2]], %[[C2]] -// CHECK: %[[REMU4:.*]] = index.remu %[[REMU1]], %[[C8]] +// CHECK: %[[REMU1:.*]] = arith.remui %[[LANE_ID]], %[[C8]] +// CHECK: %[[DIVU:.*]] = arith.divui %[[LANE_ID]], %[[C8]] +// CHECK: %[[REMU2:.*]] = arith.remui %[[DIVU]], %[[C2]] +// CHECK: %[[REMU3:.*]] = arith.remui %[[REMU2]], %[[C2]] +// CHECK: %[[REMU4:.*]] = arith.remui %[[REMU1]], %[[C8]] // CHECK: %[[MAT:.*]] = xegpu.load_matrix %arg0[%[[REMU3]], %[[REMU4]]] : !xegpu.mem_desc<32x32xf32>, index, index -> vector<1x1xf32> // CHECK: xegpu.store_matrix %[[MAT]], %arg0[%[[REMU3]], %[[REMU4]]] : vector<1x1xf32>, !xegpu.mem_desc<32x32xf32>, index, index gpu.module @xevm_module{ @@ -294,13 +294,13 @@ gpu.module @xevm_module{ // CHECK: %[[C4:.*]] = arith.constant 4 : index // CHECK: %[[C1:.*]] = arith.constant 1 : index // CHECK: %[[LANE_ID:.*]] = gpu.lane_id -// CHECK: %[[REMU1:.*]] = index.remu %[[LANE_ID]], %[[C4]] -// CHECK: %[[DIVU:.*]] = index.divu %[[LANE_ID]], %[[C4]] -// CHECK: %[[REMU2:.*]] = index.remu %[[DIVU]], %[[C4]] -// CHECK: %[[MUL:.*]] = index.mul %[[REMU2]], %[[C2]] -// CHECK: %[[REMU3:.*]] = index.remu %[[MUL]], %[[C8]] -// CHECK: %[[REMU4:.*]] = index.remu %[[REMU1]], %[[C4]] -// CHECK: %[[ADD:.*]] = index.add %[[REMU4]], %[[C1]] +// CHECK: %[[REMU1:.*]] = arith.remui %[[LANE_ID]], %[[C4]] +// CHECK: %[[DIVU:.*]] = arith.divui %[[LANE_ID]], %[[C4]] +// CHECK: %[[REMU2:.*]] = arith.remui %[[DIVU]], %[[C4]] +// CHECK: %[[MUL:.*]] = arith.muli %[[REMU2]], %[[C2]] +// CHECK: %[[REMU3:.*]] = arith.remui %[[MUL]], %[[C8]] +// CHECK: %[[REMU4:.*]] = arith.remui %[[REMU1]], %[[C4]] +// CHECK: %[[ADD:.*]] = arith.addi %[[REMU4]], %[[C1]] // CHECK: %[[MAT:.*]] = xegpu.load_matrix %arg0[%[[REMU3]], %[[ADD]]] : !xegpu.mem_desc<32x32xf32>, index, index -> vector<2x1xf32> // CHECK: xegpu.store_matrix %[[MAT]], %arg0[%[[REMU3]], %[[ADD]]] : vector<2x1xf32>, !xegpu.mem_desc<32x32xf32>, index, index gpu.module @xevm_module{ diff --git a/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir b/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir index 02c5f71d5c83d..8ce6d4dfd439e 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir @@ -3,10 +3,10 @@ gpu.module @test { gpu.func @slice_attr() -> vector<128xindex> { // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index - // CHECK-DAG: %[[DIVU:.*]] = index.divu %[[SGID]], %[[C8:.*]] - // CHECK-DAG: %[[REMU:.*]] = index.remu %[[DIVU]], %[[C4:.*]] - // CHECK-DAG: %[[MUL:.*]] = index.mul %[[REMU]], %[[C32:.*]] - // CHECK-DAG: %[[MOD:.*]] = index.remu %[[MUL]], %[[C128:.*]] + // CHECK-DAG: %[[DIVU:.*]] = arith.divui %[[SGID]], %[[C8:.*]] + // CHECK-DAG: %[[REMU:.*]] = arith.remui %[[DIVU]], %[[C4:.*]] + // CHECK-DAG: %[[MUL:.*]] = arith.muli %[[REMU]], %[[C32:.*]] + // CHECK-DAG: %[[MOD:.*]] = arith.remui %[[MUL]], %[[C128:.*]] // CHECK-DAG: %[[BASE:.*]] = vector.step : vector<32xindex> // CHECK-DAG: %[[CAST:.*]] = vector.broadcast %[[MOD]] : index to vector<32xindex> // CHECK-DAG: %[[ADD:.*]] = arith.addi %[[BASE]], %[[CAST]] : vector<32xindex> @@ -16,11 +16,10 @@ gpu.module @test { gpu.func @nested_slice_attr() -> vector<128xindex> { // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index - // CHECK-DAG: %[[DIVU1:.*]] = index.divu %[[SGID]], %[[C1:.*]] - // CHECK-DAG: %[[DIVU2:.*]] = index.divu %[[DIVU1]], %[[C8:.*]] - // CHECK-DAG: %[[REMU:.*]] = index.remu %[[DIVU2]], %[[C4:.*]] - // CHECK-DAG: %[[MUL:.*]] = index.mul %[[REMU]], %[[C32:.*]] - // CHECK-DAG: %[[MOD:.*]] = index.remu %[[MUL]], %[[C128:.*]] + // CHECK-DAG: %[[DIVU2:.*]] = arith.divui %[[SGID]], %[[C8:.*]] + // CHECK-DAG: %[[REMU:.*]] = arith.remui %[[DIVU2]], %[[C4:.*]] + // CHECK-DAG: %[[MUL:.*]] = arith.muli %[[REMU]], %[[C32:.*]] + // CHECK-DAG: %[[MOD:.*]] = arith.remui %[[MUL]], %[[C128:.*]] // CHECK-DAG: %[[BASE:.*]] = vector.step : vector<32xindex> // CHECK-DAG: %[[CAST:.*]] = vector.broadcast %[[MOD]] : index to vector<32xindex> // CHECK-DAG: %[[ADD:.*]] = arith.addi %[[BASE]], %[[CAST]] : vector<32xindex> @@ -29,4 +28,3 @@ gpu.module @test { } } - diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir index 01134d8eaabec..4829af3612de3 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir @@ -16,18 +16,18 @@ gpu.module @test_round_robin_assignment { gpu.func @create_nd_tdesc_with_shared_data(%src: memref<256x128xf32>) { // CHECK: %[[SGID:.*]] = gpu.subgroup_id : index // CHECK: %[[C4:.*]] = arith.constant 4 : index - // CHECK: %[[IDX:.*]] = index.remu %[[SGID]], %[[C4]] - // CHECK: %[[IDY_DIV:.*]] = index.divu %[[SGID]], %[[C4]] + // CHECK: %[[IDX:.*]] = arith.remui %[[SGID]], %[[C4]] + // CHECK: %[[IDY_DIV:.*]] = arith.divui %[[SGID]], %[[C4]] // CHECK: %[[C8:.*]] = arith.constant 8 : index - // CHECK: %[[IDY:.*]] = index.remu %[[IDY_DIV]], %[[C8]] + // CHECK: %[[IDY:.*]] = arith.remui %[[IDY_DIV]], %[[C8]] // CHECK: %[[C16:.*]] = arith.constant 16 : index - // CHECK: %[[LY:.*]] = index.mul %[[IDY]], %[[C16]] + // CHECK: %[[LY:.*]] = arith.muli %[[IDY]], %[[C16]] // CHECK: %[[C64:.*]] = arith.constant 64 : index - // CHECK: %[[LX:.*]] = index.mul %[[IDX]], %[[C64]] + // CHECK: %[[LX:.*]] = arith.muli %[[IDX]], %[[C64]] // CHECK: %[[C128:.*]] = arith.constant 128 : index - // CHECK: %[[OFFY:.*]] = index.remu %[[LY]], %[[C128]] + // CHECK: %[[OFFY:.*]] = arith.remui %[[LY]], %[[C128]] // CHECK: %[[C64_1:.*]] = arith.constant 64 : index - // CHECK: %[[OFFX:.*]] = index.remu %[[LX]], %[[C64_1]] + // CHECK: %[[OFFX:.*]] = arith.remui %[[LX]], %[[C64_1]] // CHECK: xegpu.create_nd_tdesc %[[ARG_0]][%[[OFFY]], %[[OFFX]]] : memref<256x128xf32> -> !xegpu.tensor_desc<16x64xf32> %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> -> !xegpu.tensor_desc<128x64xf32, #xegpu.layout> diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir index 1cddccb5fbbd1..eae51a16053d8 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir @@ -90,30 +90,27 @@ gpu.module @test_distribution { gpu.return } + // CHECK-LABEL: non_splat_constant gpu.func @non_splat_constant() { - // CHECK-DAG: %[[BASECST:.*]] = arith.constant dense<{{.*}}> : vector<2x1xindex> + // CHECK-DAG: %[[CST:.*]] = arith.constant dense<{{.*}}0{{.*}}, {{.*}}16{{.*}}> : vector<2x1xindex> // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index - // CHECK-DAG: %[[REMU1:.*]] = index.remu %[[SGID]], %[[C1:.*]] - // CHECK-DAG: %[[DIVU:.*]] = index.divu %[[SGID]], %[[C1:.*]] - // CHECK-DAG: %[[REMU2:.*]] = index.remu %[[DIVU]], %[[C8:.*]] - // CHECK-DAG: %[[MUL:.*]] = index.mul %[[REMU2]], %[[C2:.*]] - // CHECK-DAG: %[[REMU3:.*]] = index.remu %[[MUL]], %[[C32:.*]] - // CHECK-DAG: %[[REMU4:.*]] = index.remu %[[REMU1]], %[[C1:.*]] - // CHECK-DAG: %[[ADD16:.*]] = arith.addi %[[MUL]], %[[C16:.*]] : index - // CHECK-DAG: %[[REMU5:.*]] = index.remu %[[ADD16]], %[[C32:.*]] - // CHECK-DAG: %[[REMU6:.*]] = index.remu %[[REMU1]], %[[C1:.*]] - // CHECK-DAG: %[[STRIDE1:.*]] = arith.muli %[[REMU3]], %[[C16:.*]] : index - // CHECK-DAG: %[[ADDSTRIDES:.*]] = arith.addi %[[C0:.*]], %[[STRIDE1]] : index - // CHECK-DAG: %[[STRIDE2:.*]] = arith.muli %[[REMU4]], %[[C0:.*]] : index - // CHECK-DAG: %[[ADDSTRIDES1:.*]] = arith.addi %[[ADDSTRIDES]], %[[STRIDE2]] : index - // CHECK-DAG: %[[BCAST1:.*]] = vector.broadcast %[[ADDSTRIDES1]] : index to vector<2x1xindex> - // CHECK-DAG: %[[RESULT1:.*]] = arith.addi %[[BASECST]], %[[BCAST1]] : vector<2x1xindex> - // CHECK-DAG: %[[STRIDE3:.*]] = arith.muli %[[REMU5]], %[[C16:.*]] : index - // CHECK-DAG: %[[ADDSTRIDES2:.*]] = arith.addi %[[C0:.*]], %[[STRIDE3]] : index - // CHECK-DAG: %[[STRIDE4:.*]] = arith.muli %[[REMU6]], %[[C0:.*]] : index - // CHECK-DAG: %[[ADDSTRIDES3:.*]] = arith.addi %[[ADDSTRIDES2]], %[[STRIDE4]] : index - // CHECK-DAG: %[[BCAST2:.*]] = vector.broadcast %[[ADDSTRIDES3]] : index to vector<2x1xindex> - // CHECK-DAG: %[[RESULT2:.*]] = arith.addi %[[BASECST]], %[[BCAST2]] : vector<2x1xindex> + // CHECK-DAG: %[[T1:.*]] = arith.remui %[[SGID]], %[[C8:.*]] : index + // CHECK-DAG: %[[T2:.*]] = arith.muli %[[T1]], %[[C2:.*]] : index + // CHECK-DAG: %[[T3:.*]] = arith.remui %[[T2]], %[[C32:.*]] : index + // CHECK-DAG: %[[T4:.*]] = arith.addi %[[T2]], %[[C16:.*]] : index + // CHECK-DAG: %[[T5:.*]] = arith.remui %[[T4]], %[[C32_6:.*]] : index + // CHECK-DAG: %[[T6:.*]] = arith.muli %[[T3]], %[[C16_10:.*]] : index + // CHECK-DAG: %[[T7:.*]] = arith.addi %[[C0_11:.*]], %[[T6]] : index + // CHECK-DAG: %[[T8:.*]] = arith.muli %[[C0_4:.*]], %[[C0_9:.*]] : index + // CHECK-DAG: %[[T9:.*]] = arith.addi %[[T7]], %[[T8]] : index + // CHECK-DAG: %[[T10:.*]] = vector.broadcast %[[T9]] : index to vector<2x1xindex> + // CHECK-DAG: %[[T11:.*]] = arith.addi %[[CST]], %[[T10]] : vector<2x1xindex> + // CHECK-DAG: %[[T12:.*]] = arith.muli %[[T5]], %[[C16_10:.*]] : index + // CHECK-DAG: %[[T13:.*]] = arith.addi %[[C0_12:.*]], %[[T12]] : index + // CHECK-DAG: %[[T14:.*]] = arith.muli %[[C0_8:.*]], %[[C0_9:.*]] : index + // CHECK-DAG: %[[T15:.*]] = arith.addi %[[T13]], %[[T14]] : index + // CHECK-DAG: %[[T16:.*]] = vector.broadcast %[[T15]] : index to vector<2x1xindex> + // CHECK-DAG: %[[T17:.*]] = arith.addi %[[CST]], %[[T16]] : vector<2x1xindex> %cst_2 = arith.constant {layout_result_0 = #xegpu.layout} dense<[[0], [16], [32], [48], [64], [80], [96], [112], [128], [144], [160], [176], [192], [208], [224], [240], [256], [272], [288], [304], [320], [336], [352], [368], [384], [400], [416], [432], [448], [464], [480], [496]]> : vector<32x1xindex> gpu.return } @@ -139,4 +136,3 @@ gpu.module @test_distribution { gpu.return } } - diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir index 574b365443a0a..98920d61c4f58 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir @@ -27,17 +27,17 @@ gpu.module @test_distribution { //CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout> //CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index //CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index - //CHECK-DAG: %[[SGIDX:.*]] = index.remu %[[SGID]], %[[C4]] - //CHECK-DAG: %[[SGIDY_TMP:.*]] = index.divu %[[SGID]], %[[C4]] + //CHECK-DAG: %[[SGIDX:.*]] = arith.remui %[[SGID]], %[[C4]] + //CHECK-DAG: %[[SGIDY_TMP:.*]] = arith.divui %[[SGID]], %[[C4]] //CHECK-DAG: %[[C8:.*]] = arith.constant 8 : index - //CHECK-DAG: %[[SGIDY:.*]] = index.remu %[[SGIDY_TMP]], %[[C8]] + //CHECK-DAG: %[[SGIDY:.*]] = arith.remui %[[SGIDY_TMP]], %[[C8]] //CHECK-DAG: %[[C32:.*]] = arith.constant 32 : index - //CHECK-DAG: %[[L_OFF_Y:.*]] = index.mul %[[SGIDY]], %[[C32]] - //CHECK-DAG: %[[L_OFF_X:.*]] = index.mul %[[SGIDX]], %[[C32]] + //CHECK-DAG: %[[L_OFF_Y:.*]] = arith.muli %[[SGIDY]], %[[C32]] : index + //CHECK-DAG: %[[L_OFF_X:.*]] = arith.muli %[[SGIDX]], %[[C32_1:.*]] : index //CHECK-DAG: %[[C256:.*]] = arith.constant 256 : index - //CHECK-DAG: %[[OFF_Y:.*]] = index.remu %[[L_OFF_Y]], %[[C256]] + //CHECK-DAG: %[[OFF_Y:.*]] = arith.remui %[[L_OFF_Y]], %[[C256]] : index //CHECK-DAG: %[[C128:.*]] = arith.constant 128 : index - //CHECK-DAG: %[[OFF_X:.*]] = index.remu %[[L_OFF_X]], %[[C128]] + //CHECK-DAG: %[[OFF_X:.*]] = arith.remui %[[L_OFF_X]], %[[C128]] : index //CHECK-DAG: %[[LOAD:.*]] = xegpu.load_nd %[[TDESC]][{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<32x32xf32, #xegpu.layout> -> vector<32x32xf32> %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> @@ -293,7 +293,7 @@ gpu.module @test_distribution { %val = arith.constant {layout_result_0 = #xegpu.layout} dense<25.5> : vector<256xf16> %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<0> : vector<256xindex> %mask = arith.constant {layout_result_0 = #xegpu.layout} dense<1> : vector<256xi1> - xegpu.store %val, %dest[%offset], %mask {chunk_size = 1, layout_operand_0 = #xegpu.layout, + xegpu.store %val, %dest[%offset], %mask {chunk_size = 1, layout_operand_0 = #xegpu.layout, layout_operand_2 = #xegpu.layout, layout_operand_3 = #xegpu.layout, l1_hint = #xegpu.cache_hint} @@ -321,18 +321,18 @@ gpu.module @test_distribution { //CHECK: [[mdesc:%.+]] = xegpu.create_mem_desc [[arg0]] : memref<32768xi8, 3> -> !xegpu.mem_desc<64x128xf32> //CHECK: [[sgid:%.+]] = gpu.subgroup_id : index //CHECK: [[c4:%.+]] = arith.constant 4 : index - //CHECK: [[sgidx:%.+]] = index.remu [[sgid]], [[c4]] - //CHECK: [[sgidy_tmp:%.+]] = index.divu [[sgid]], [[c4]] + //CHECK: [[sgidx:%.+]] = arith.remui [[sgid]], [[c4]] : index + //CHECK: [[sgidy_tmp:%.+]] = arith.divui [[sgid]], [[c4]] : index //CHECK: [[c2:%.+]] = arith.constant 2 : index - //CHECK: [[sgidy:%.+]] = index.remu [[sgidy_tmp]], [[c2]] + //CHECK: [[sgidy:%.+]] = arith.remui [[sgidy_tmp]], [[c2]] : index //CHECK: [[c32:%.+]] = arith.constant 32 : index - //CHECK: [[l_off_y:%.+]] = index.mul [[sgidy]], [[c32]] + //CHECK: [[l_off_y:%.+]] = arith.muli [[sgidy]], [[c32]] : index //CHECK: [[c32_0:%.+]] = arith.constant 32 : index - //CHECK: [[l_off_x:%.+]] = index.mul [[sgidx]], [[c32_0]] + //CHECK: [[l_off_x:%.+]] = arith.muli [[sgidx]], [[c32_0]] : index //CHECK: [[c64:%.+]] = arith.constant 64 : index - //CHECK: [[off_y:%.+]] = index.remu [[l_off_y]], [[c64]] + //CHECK: [[off_y:%.+]] = arith.remui [[l_off_y]], [[c64]] : index //CHECK: [[c128:%.+]] = arith.constant 128 : index - //CHECK: [[off_x:%.+]] = index.remu [[l_off_x]], [[c128]] + //CHECK: [[off_x:%.+]] = arith.remui [[l_off_x]], [[c128]] : index //CHECK: xegpu.load_matrix [[mdesc]][[[off_y]], [[off_x]]] <{layout = #xegpu.layout}>: !xegpu.mem_desc<64x128xf32>, index, index -> vector<32x32xf32> %0 = xegpu.create_mem_desc %arg0 : memref<32768xi8, 3> -> !xegpu.mem_desc<64x128xf32> %1 = xegpu.load_matrix %0[0, 0] <{layout = #xegpu.layout}>: !xegpu.mem_desc<64x128xf32> -> vector<64x128xf32> @@ -346,18 +346,18 @@ gpu.module @test_distribution { //CHECK: [[mdesc:%.+]] = xegpu.create_mem_desc [[arg0]] : memref<32768xi8, 3> -> !xegpu.mem_desc<64x128xf32> //CHECK: [[sgid:%.+]] = gpu.subgroup_id : index //CHECK: [[c4:%.+]] = arith.constant 4 : index - //CHECK: [[sgidx:%.+]] = index.remu [[sgid]], [[c4]] - //CHECK: [[sgidy_tmp:%.+]] = index.divu [[sgid]], [[c4]] + //CHECK: [[sgidx:%.+]] = arith.remui [[sgid]], [[c4]] : index + //CHECK: [[sgidy_tmp:%.+]] = arith.divui [[sgid]], [[c4]] : index //CHECK: [[c2:%.+]] = arith.constant 2 : index - //CHECK: [[sgidy:%.+]] = index.remu [[sgidy_tmp]], [[c2]] + //CHECK: [[sgidy:%.+]] = arith.remui [[sgidy_tmp]], [[c2]] : index //CHECK: [[c32:%.+]] = arith.constant 32 : index - //CHECK: [[l_off_y:%.+]] = index.mul [[sgidy]], [[c32]] + //CHECK: [[l_off_y:%.+]] = arith.muli [[sgidy]], [[c32]] : index //CHECK: [[c32_0:%.+]] = arith.constant 32 : index - //CHECK: [[l_off_x:%.+]] = index.mul [[sgidx]], [[c32_0]] + //CHECK: [[l_off_x:%.+]] = arith.muli [[sgidx]], [[c32_0]] : index //CHECK: [[c64:%.+]] = arith.constant 64 : index - //CHECK: [[off_y:%.+]] = index.remu [[l_off_y]], [[c64]] + //CHECK: [[off_y:%.+]] = arith.remui [[l_off_y]], [[c64]] : index //CHECK: [[c128:%.+]] = arith.constant 128 : index - //CHECK: [[off_x:%.+]] = index.remu [[l_off_x]], [[c128]] + //CHECK: [[off_x:%.+]] = arith.remui [[l_off_x]], [[c128]] : index //CHECK: xegpu.store_matrix [[cst]], [[mdesc]][[[off_y]], [[off_x]]] : vector<32x32xf32>, !xegpu.mem_desc<64x128xf32>, index, index %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<1.0> : vector<64x128xf32> %mdesc = xegpu.create_mem_desc %arg0 : memref<32768xi8, 3> -> !xegpu.mem_desc<64x128xf32> @@ -409,14 +409,14 @@ gpu.module @test_distribution { gpu.func @vector_step_op_slice_attr() { //CHECK: [[sgId:%.+]] = gpu.subgroup_id : index //CHECK: [[c8:%.+]] = arith.constant 8 : index - //CHECK: [[sgidx:%.+]] = index.remu [[sgId]], [[c8]] - //CHECK: [[sgidy_tmp:%.+]] = index.divu [[sgId]], [[c8]] + //CHECK: [[sgidx:%.+]] = arith.remui [[sgId]], [[c8]] : index + //CHECK: [[sgidy_tmp:%.+]] = arith.divui [[sgId]], [[c8]] : index //CHECK: [[c4:%.+]] = arith.constant 4 : index - //CHECK: [[sgidy:%.+]] = index.remu [[sgidy_tmp]], [[c4]] + //CHECK: [[sgidy:%.+]] = arith.remui [[sgidy_tmp]], [[c4]] : index //CHECK: [[c32:%.+]] = arith.constant 32 : index - //CHECK: [[LY:%.+]] = index.mul [[sgidy]], [[c32]] + //CHECK: [[LY:%.+]] = arith.muli [[sgidy]], [[c32]] : index //CHECK: [[c128:%.+]] = arith.constant 128 : index - //CHECK: [[MODY:%.+]] = index.remu [[LY]], [[c128]] + //CHECK: [[MODY:%.+]] = arith.remui [[LY]], [[c128]] : index //CHECK: [[BASE:%.+]] = vector.step : vector<32xindex> //CHECK: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<32xindex> //CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<32xindex> @@ -427,11 +427,11 @@ gpu.module @test_distribution { gpu.func @vector_step_op_layout_attr() { //CHECK: [[sgId:%.+]] = gpu.subgroup_id : index //CHECK: [[c16:%.+]] = arith.constant 16 : index - //CHECK: [[sgidx:%.+]] = index.remu [[sgId]], [[c16]] + //CHECK: [[sgidx:%.+]] = arith.remui [[sgId]], [[c16]] : index //CHECK: [[c8:%.+]] = arith.constant 8 : index - //CHECK: [[LOCALY:%.+]] = index.mul [[sgidx]], [[c8]] + //CHECK: [[LOCALY:%.+]] = arith.muli [[sgidx]], [[c8]] : index //CHECK: [[c128:%.+]] = arith.constant 128 : index - //CHECK: [[MODY:%.+]] = index.remu [[LOCALY]], [[c128]] + //CHECK: [[MODY:%.+]] = arith.remui [[LOCALY]], [[c128]] : index //CHECK: [[BASE:%.+]] = vector.step : vector<8xindex> //CHECK: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<8xindex> //CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<8xindex> @@ -479,18 +479,15 @@ gpu.module @test_distribution { // CHECK-LABEL: non_splat_constant_2D gpu.func @non_splat_constant_2D() { // CHECK-DAG: %[[CST:.*]] = arith.constant dense<0> : vector<1x1xindex> - // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index - // CHECK-DAG: %[[SGIDX:.*]] = index.remu %[[SGID]], %{{.*}} - // CHECK-DAG: %[[SGIDY_TMP:.*]] = index.divu %[[SGID]], %{{.*}} - // CHECK-DAG: %[[SGIDY:.*]] = index.remu %[[SGIDY_TMP]], %{{.*}} - // CHECK-DAG: %[[IDY:.*]] = index.remu %[[SGIDY]], %{{.*}} - // CHECK-DAG: %[[IDX:.*]] = index.remu %[[SGIDX]], %{{.*}} - // CHECK-DAG: %[[STRIDECOL:.*]] = arith.muli %[[IDY]], %[[C16:.*]] : index - // CHECK-DAG: %[[ADD:.*]] = arith.addi %[[C0:.*]], %[[STRIDECOL]] : index - // CHECK-DAG: %[[STRIDEROW:.*]] = arith.muli %[[IDX]], %[[C0:.*]] : index - // CHECK-DAG: %[[ADDSTRIDES:.*]] = arith.addi %[[ADD]], %[[STRIDEROW]] : index - // CHECK-DAG: %[[BCAST:.*]] = vector.broadcast %[[ADDSTRIDES]] : index to vector<1x1xindex> - // CHECK-DAG: arith.addi %[[CST]], %[[BCAST]] : vector<1x1xindex> + // CHECK-DAG: %[[T0:.*]] = gpu.subgroup_id : index + // CHECK-DAG: %[[T1:.*]] = arith.remui %[[T0]], %[[C32:.*]] : index + // CHECK-DAG: %[[T2:.*]] = arith.remui %[[T1]], %[[C32_4:.*]] : index + // CHECK-DAG: %[[T3:.*]] = arith.muli %[[T2]], %[[C16:.*]] : index + // CHECK-DAG: %[[T4:.*]] = arith.addi %[[C0_8:.*]], %[[T3]] : index + // CHECK-DAG: %[[T5:.*]] = arith.muli %[[C0_6:.*]], %[[C0_7:.*]] : index + // CHECK-DAG: %[[T6:.*]] = arith.addi %[[T4]], %[[T5]] : index + // CHECK-DAG: %[[T7:.*]] = vector.broadcast %[[T6]] : index to vector<1x1xindex> + // CHECK-DAG: %[[T8:.*]] = arith.addi %[[CST]], %[[T7]] : vector<1x1xindex> %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<[[0], [16], [32], [48], [64], [80], [96], [112], [128], [144], [160], [176], [192], [208], [224], [240], [256], [272], [288], [304], [320], [336], [352], [368], [384], [400], [416], [432], [448], [464], [480], [496]]> : vector<32x1xindex> gpu.return } @@ -499,13 +496,13 @@ gpu.module @test_distribution { gpu.func @non_splat_constant_2D_non_unit_dim() { // CHECK-DAG: %[[BASECST:.*]] = arith.constant dense<{{\[}}{{\[}}0, 16{{\]}}, {{\[}}8, 24{{\]}}{{\]}}> : vector<2x2xindex> // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index - // CHECK-DAG: %[[SGIDX:.*]] = index.remu %[[SGID]], %{{.*}} - // CHECK-DAG: %[[SGIDY_TMP:.*]] = index.divu %[[SGID]], %{{.*}} - // CHECK-DAG: %[[SGIDY:.*]] = index.remu %[[SGIDY_TMP]], %{{.*}} - // CHECK-DAG: %[[MULY:.*]] = index.mul %[[SGIDY]], %[[C2:.*]] - // CHECK-DAG: %[[MULX:.*]] = index.mul %[[SGIDX]], %{{.*}} - // CHECK-DAG: %[[REMU_Y:.*]] = index.remu %[[MULY]], %[[C8:.*]] - // CHECK-DAG: %[[REMU_X:.*]] = index.remu %[[MULX]], %{{.*}} + // CHECK-DAG: %[[SGIDX:.*]] = arith.remui %[[SGID]], %{{.*}} + // CHECK-DAG: %[[SGIDY_TMP:.*]] = arith.divui %[[SGID]], %{{.*}} + // CHECK-DAG: %[[SGIDY:.*]] = arith.remui %[[SGIDY_TMP]], %{{.*}} + // CHECK-DAG: %[[MULY:.*]] = arith.muli %[[SGIDY]], %[[C2:.*]] : index + // CHECK-DAG: %[[MULX:.*]] = arith.muli %[[SGIDX]], %{{.*}} : index + // CHECK-DAG: %[[REMU_Y:.*]] = arith.remui %[[MULY]], %[[C8:.*]] : index + // CHECK-DAG: %[[REMU_X:.*]] = arith.remui %[[MULX]], %{{.*}} : index // CHECK-DAG: %[[MUL5:.*]] = arith.muli %[[REMU_Y]], %{{.*}} : index // CHECK-DAG: %[[ADD:.*]] = arith.addi %[[C0:.*]], %[[MUL5]] : index // CHECK-DAG: %[[MUL6:.*]] = arith.muli %[[REMU_X]], %[[C16:.*]] : index @@ -529,8 +526,8 @@ gpu.module @test_distribution { gpu.func @non_splat_constant() { // CHECK-DAG: %[[CST:.*]] = arith.constant dense<0> : vector<1xindex> // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index - // CHECK-DAG: %[[REMU:.*]] = index.remu %[[SGID]], %{{.*}} - // CHECK-DAG: %[[REMU2:.*]] = index.remu %[[REMU]], %{{.*}} + // CHECK-DAG: %[[REMU:.*]] = arith.remui %[[SGID]], %{{.*}} + // CHECK-DAG: %[[REMU2:.*]] = arith.remui %[[REMU]], %{{.*}} // CHECK-DAG: %[[MUL:.*]] = arith.muli %[[REMU2]], %[[C16:.*]] : index // CHECK-DAG: %[[ADDSTRIDES:.*]] = arith.addi %[[C0:.*]], %[[MUL]] : index // CHECK-DAG: %[[BCAST:.*]] = vector.broadcast %[[ADDSTRIDES]] : index to vector<1xindex> @@ -551,9 +548,9 @@ gpu.module @test_distribution { // CHECK-LABEL: vector_mask_1D gpu.func @vector_mask_1D() { // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index - // CHECK-DAG: %[[REMU:.*]] = index.remu %[[SGID]], %[[C2:.*]] - // CHECK-DAG: %[[MUL:.*]] = index.mul %[[REMU]], %[[C16:.*]] - // CHECK-DAG: %[[REMU2:.*]] = index.remu %[[MUL]], %[[C32:.*]] + // CHECK-DAG: %[[REMU:.*]] = arith.remui %[[SGID]], %[[C2:.*]] + // CHECK-DAG: %[[MUL:.*]] = arith.muli %[[REMU]], %[[C16:.*]] : index + // CHECK-DAG: %[[REMU2:.*]] = arith.remui %[[MUL]], %[[C32:.*]] : index // CHECK-DAG: %[[SUB:.*]] = arith.subi %[[C8:.*]], %[[REMU2]] : index // CHECK-DAG: %[[MAX:.*]] = arith.maxsi %[[SUB]], %[[C0:.*]] : index // CHECK-DAG: %[[MIN:.*]] = arith.minsi %[[MAX]], %[[C16:.*]] : index @@ -565,13 +562,13 @@ gpu.module @test_distribution { // CHECK-LABEL: vector_mask_2D gpu.func @vector_mask_2D() { // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index - // CHECK-DAG: %[[SGIDX:.*]] = index.remu %[[SGID]], %[[C4:.*]] - // CHECK-DAG: %[[SGIDY_TMP:.*]] = index.divu %[[SGID]], %[[C4:.*]] - // CHECK-DAG: %[[SGIDY:.*]] = index.remu %[[SGIDY_TMP]], %[[C8:.*]] - // CHECK-DAG: %[[ROW:.*]] = index.mul %[[SGIDY]], %[[C32:.*]] - // CHECK-DAG: %[[COL:.*]] = index.mul %[[SGIDX]], %[[C32:.*]] - // CHECK-DAG: %[[MODROW:.*]] = index.remu %[[ROW]], %[[C256:.*]] - // CHECK-DAG: %[[MODCOL:.*]] = index.remu %[[COL]], %[[C128:.*]] + // CHECK-DAG: %[[SGIDX:.*]] = arith.remui %[[SGID]], %[[C4:.*]] + // CHECK-DAG: %[[SGIDY_TMP:.*]] = arith.divui %[[SGID]], %[[C4:.*]] + // CHECK-DAG: %[[SGIDY:.*]] = arith.remui %[[SGIDY_TMP]], %[[C8:.*]] + // CHECK-DAG: %[[ROW:.*]] = arith.muli %[[SGIDY]], %[[C32:.*]] : index + // CHECK-DAG: %[[COL:.*]] = arith.muli %[[SGIDX]], %[[C32:.*]] : index + // CHECK-DAG: %[[MODROW:.*]] = arith.remui %[[ROW]], %[[C256:.*]] : index + // CHECK-DAG: %[[MODCOL:.*]] = arith.remui %[[COL]], %[[C128:.*]] : index // CHECK-DAG: %[[SUBROW:.*]] = arith.subi %[[C16:.*]], %[[MODROW]] : index // CHECK-DAG: %[[MAXROW:.*]] = arith.maxsi %[[SUBROW]], %[[C4:.*]] : index // CHECK-DAG: %[[MINROW:.*]] = arith.minsi %[[MAXROW]], %[[C32:.*]] : index diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir index 5ce3d1d0fb5d6..a8015cced7eb4 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir @@ -5,13 +5,13 @@ gpu.module @test_1_1_assignment { // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> gpu.func @create_nd_tdesc(%src: memref<256x128xf32>) { // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index - // CHECK-DAG: %[[REMUX:.*]] = index.remu %[[SGID]], %[[C4:.*]] - // CHECK-DAG: %[[DIVU:.*]] = index.divu %[[SGID]], %[[C4:.*]] - // CHECK-DAG: %[[REMUY:.*]] = index.remu %[[DIVU]], %[[C8:.*]] - // CHECK-DAG: %[[MULY:.*]] = index.mul %[[REMUY]], %[[C32:.*]] - // CHECK-DAG: %[[MULX:.*]] = index.mul %[[REMUX]], %[[C32:.*]] - // CHECK-DAG: %[[MODY:.*]] = index.remu %[[MULY]], %[[C256:.*]] - // CHECK-DAG: %[[MODX:.*]] = index.remu %[[MULX]], %[[C128:.*]] + // CHECK-DAG: %[[REMUX:.*]] = arith.remui %[[SGID]], %[[C4:.*]] + // CHECK-DAG: %[[DIVU:.*]] = arith.divui %[[SGID]], %[[C4:.*]] + // CHECK-DAG: %[[REMUY:.*]] = arith.remui %[[DIVU]], %[[C8:.*]] + // CHECK-DAG: %[[MULY:.*]] = arith.muli %[[REMUY]], %[[C32:.*]] + // CHECK-DAG: %[[MULX:.*]] = arith.muli %[[REMUX]], %[[C32:.*]] + // CHECK-DAG: %[[MODY:.*]] = arith.remui %[[MULY]], %[[C256:.*]] + // CHECK-DAG: %[[MODX:.*]] = arith.remui %[[MULX]], %[[C128:.*]] // CHECK-DAG: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][%[[MODY]], %[[MODX]]] : memref<256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout> %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> @@ -22,13 +22,13 @@ gpu.module @test_1_1_assignment { // CHECK-SAME: %[[ARG_0:.*]]: memref<3x256x128xf32> gpu.func @create_nd_tdesc_from_higher_rank_memref(%src: memref<3x256x128xf32>) { // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index - // CHECK-DAG: %[[REMUX:.*]] = index.remu %[[SGID]], %[[C4:.*]] - // CHECK-DAG: %[[DIVU:.*]] = index.divu %[[SGID]], %[[C4:.*]] - // CHECK-DAG: %[[REMUY:.*]] = index.remu %[[DIVU]], %[[C8:.*]] - // CHECK-DAG: %[[MULY:.*]] = index.mul %[[REMUY]], %[[C32:.*]] - // CHECK-DAG: %[[MULX:.*]] = index.mul %[[REMUX]], %[[C32:.*]] - // CHECK-DAG: %[[MODY:.*]] = index.remu %[[MULY]], %[[C256:.*]] - // CHECK-DAG: %[[MODX:.*]] = index.remu %[[MULX]], %[[C128:.*]] + // CHECK-DAG: %[[REMUX:.*]] = arith.remui %[[SGID]], %[[C4:.*]] + // CHECK-DAG: %[[DIVU:.*]] = arith.divui %[[SGID]], %[[C4:.*]] + // CHECK-DAG: %[[REMUY:.*]] = arith.remui %[[DIVU]], %[[C8:.*]] + // CHECK-DAG: %[[MULY:.*]] = arith.muli %[[REMUY]], %[[C32:.*]] + // CHECK-DAG: %[[MULX:.*]] = arith.muli %[[REMUX]], %[[C32:.*]] + // CHECK-DAG: %[[MODY:.*]] = arith.remui %[[MULY]], %[[C256:.*]] + // CHECK-DAG: %[[MODX:.*]] = arith.remui %[[MULX]], %[[C128:.*]] // CHECK-DAG: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][1, %[[MODY]], %[[MODX]]] : memref<3x256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout> %tdesc = xegpu.create_nd_tdesc %src[1, 0, 0] : memref<3x256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout>