From 7e59be20961fc11028f74d8025fac36b40a08ab3 Mon Sep 17 00:00:00 2001
From: hanhanW
Date: Mon, 2 Jun 2025 03:37:15 -0700
Subject: [PATCH 1/3] [mlir][memref] Update tests to use memref.assume_alignment properly.

- Update all the lit tests to use the result of memref.assume_alignment, if it is present.
- Capture the result of the op in lit tests.

Signed-off-by: hanhanW
---
 mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir     | 4 ++--
 mlir/test/Dialect/MemRef/emulate-narrow-type.mlir         | 2 +-
 mlir/test/Dialect/MemRef/ops.mlir                         | 2 +-
 .../GPU/CUDA/sm90/gemm_f32_f16_f16_128x128x128.mlir       | 6 +++---
 .../GPU/CUDA/sm90/gemm_pred_f32_f16_f16_128x128x128.mlir  | 6 +++---
 5 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir b/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir
index 8c863bb2d3d65..acfc188574255 100644
--- a/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir
+++ b/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir
@@ -189,7 +189,7 @@ func.func @assume_alignment(%0 : memref<4x4xf16>) {
 // CHECK-NEXT: %[[ALIGN:.*]] = llvm.mlir.constant(16 : index) : i64
 // CHECK-NEXT: llvm.intr.assume %[[TRUE]] ["align"(%[[PTR]], %[[ALIGN]] : !llvm.ptr, i64)] : i1
 // CHECK-INTERFACE: llvm.intr.assume
-  memref.assume_alignment %0, 16 : memref<4x4xf16>
+  %1 = memref.assume_alignment %0, 16 : memref<4x4xf16>
   return
 }
@@ -205,7 +205,7 @@ func.func @assume_alignment_w_offset(%0 : memref<4x4xf16, strided<[?, ?], offset
 // CHECK-DAG: %[[ALIGN:.*]] = llvm.mlir.constant(16 : index) : i64
 // CHECK-NEXT: llvm.intr.assume %[[TRUE]] ["align"(%[[BUFF_ADDR]], %[[ALIGN]] : !llvm.ptr, i64)] : i1
 // CHECK-INTERFACE: llvm.intr.assume
-  memref.assume_alignment %0, 16 : memref<4x4xf16, strided<[?, ?], offset: ?>>
+  %1 = memref.assume_alignment %0, 16 : memref<4x4xf16, strided<[?, ?], offset: ?>>
   return
 }
 // -----
diff --git a/mlir/test/Dialect/MemRef/emulate-narrow-type.mlir b/mlir/test/Dialect/MemRef/emulate-narrow-type.mlir
index 111a02abcc74c..3378d329e8205 100644
--- a/mlir/test/Dialect/MemRef/emulate-narrow-type.mlir
+++ b/mlir/test/Dialect/MemRef/emulate-narrow-type.mlir
@@ -63,7 +63,7 @@ func.func @memref_load_i4(%arg0: index) -> i4 {
 
 func.func @memref_load_i4_rank2(%arg0: index, %arg1: index) -> i4 {
   %0 = memref.alloc() : memref<3x125xi4>
-  %align0 =memref.assume_alignment %0, 64 : memref<3x125xi4>
+  %align0 = memref.assume_alignment %0, 64 : memref<3x125xi4>
   %1 = memref.load %align0[%arg0,%arg1] : memref<3x125xi4>
   return %1 : i4
 }
diff --git a/mlir/test/Dialect/MemRef/ops.mlir b/mlir/test/Dialect/MemRef/ops.mlir
index 38ee363a7d424..13fdf3cf13510 100644
--- a/mlir/test/Dialect/MemRef/ops.mlir
+++ b/mlir/test/Dialect/MemRef/ops.mlir
@@ -283,7 +283,7 @@ func.func @memref_view(%arg0 : index, %arg1 : index, %arg2 : index) {
 // CHECK-LABEL: func @assume_alignment
 // CHECK-SAME: %[[MEMREF:.*]]: memref<4x4xf16>
 func.func @assume_alignment(%0: memref<4x4xf16>) {
-  // CHECK: memref.assume_alignment %[[MEMREF]], 16 : memref<4x4xf16>
+  // CHECK: %{{.*}} = memref.assume_alignment %[[MEMREF]], 16 : memref<4x4xf16>
   %1 = memref.assume_alignment %0, 16 : memref<4x4xf16>
   return
 }
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/gemm_f32_f16_f16_128x128x128.mlir b/mlir/test/Integration/GPU/CUDA/sm90/gemm_f32_f16_f16_128x128x128.mlir
index aaa3aff5350ad..6153a11622a4f 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/gemm_f32_f16_f16_128x128x128.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/gemm_f32_f16_f16_128x128x128.mlir
@@ -120,7 +120,7 @@ func.func @main() {
             threads(%arg3, %arg4, %arg5) in (%arg9 = %hc128, %arg10 = %hc1, %arg11 = %hc1)
             dynamic_shared_memory_size %shmemSize
 {
-    memref.assume_alignment %matrixD, 16 : memref<128x128xf32>
+    %align_matrixD = memref.assume_alignment %matrixD, 16 : memref<128x128xf32>
     %c256 = arith.constant 256 : index
     %c10000000 = arith.constant 10000000 : index
 
@@ -226,13 +226,13 @@ func.func @main() {
     scf.for %arg12 = %17 to %c128 step %c4 {
       %19 = arith.muli %18, %c4 : index
       %20 = vector.load %accShmemPtr[%arg12, %19] : memref<128x128xf32, 3>, vector<4xf32>
-      vector.store %20, %matrixD[%arg12, %19] : memref<128x128xf32>, vector<4xf32>
+      vector.store %20, %align_matrixD[%arg12, %19] : memref<128x128xf32>, vector<4xf32>
     }
     gpu.terminator
   }
 
   // Step 5. Copy D2H
-  %5 = gpu.memcpy async [%token] %matrixDHost, %matrixD : memref<128x128xf32>, memref<128x128xf32>
+  %5 = gpu.memcpy async [%token] %matrixDHost, %align_matrixD : memref<128x128xf32>, memref<128x128xf32>
   gpu.wait [%token]
 
   // Step 6. Compute on host
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/gemm_pred_f32_f16_f16_128x128x128.mlir b/mlir/test/Integration/GPU/CUDA/sm90/gemm_pred_f32_f16_f16_128x128x128.mlir
index b257d2b0f1e34..b8e355712d37f 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/gemm_pred_f32_f16_f16_128x128x128.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/gemm_pred_f32_f16_f16_128x128x128.mlir
@@ -120,7 +120,7 @@ func.func @main() {
             threads(%arg3, %arg4, %arg5) in (%arg9 = %hc128, %arg10 = %hc1, %arg11 = %hc1)
             dynamic_shared_memory_size %shmemSize
 {
-    memref.assume_alignment %matrixD, 16 : memref<128x128xf32>
+    %align_matrixD = memref.assume_alignment %matrixD, 16 : memref<128x128xf32>
     %c256 = arith.constant 256 : index
     %c10000000 = arith.constant 10000000 : index
 
@@ -234,13 +234,13 @@ func.func @main() {
     scf.for %arg12 = %17 to %c128 step %c4 {
       %19 = arith.muli %18, %c4 : index
       %20 = vector.load %accShmemPtr[%arg12, %19] : memref<128x128xf32, 3>, vector<4xf32>
-      vector.store %20, %matrixD[%arg12, %19] : memref<128x128xf32>, vector<4xf32>
+      vector.store %20, %align_matrixD[%arg12, %19] : memref<128x128xf32>, vector<4xf32>
     }
     gpu.terminator
   }
 
   // Step 5. Copy D2H
-  %5 = gpu.memcpy async [%token] %matrixDHost, %matrixD : memref<128x128xf32>, memref<128x128xf32>
+  %5 = gpu.memcpy async [%token] %matrixDHost, %align_matrixD : memref<128x128xf32>, memref<128x128xf32>
   gpu.wait [%token]
 
   // Step 6. Compute on host

From a1c050ce7170f1ceba0c91b52fba63d2c5df4bf7 Mon Sep 17 00:00:00 2001
From: hanhanW
Date: Mon, 2 Jun 2025 03:48:32 -0700
Subject: [PATCH 2/3] hoist the declaration out

Signed-off-by: hanhanW
---
 .../Integration/GPU/CUDA/sm90/gemm_f32_f16_f16_128x128x128.mlir | 2 +-
 .../GPU/CUDA/sm90/gemm_pred_f32_f16_f16_128x128x128.mlir        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/mlir/test/Integration/GPU/CUDA/sm90/gemm_f32_f16_f16_128x128x128.mlir b/mlir/test/Integration/GPU/CUDA/sm90/gemm_f32_f16_f16_128x128x128.mlir
index 6153a11622a4f..3697317e0f3d5 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/gemm_f32_f16_f16_128x128x128.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/gemm_f32_f16_f16_128x128x128.mlir
@@ -106,6 +106,7 @@ func.func @main() {
   %matrixA:2 = gpu.alloc async [%token] () : memref<128x128xf16>
   %matrixB:2 = gpu.alloc async [%token] () : memref<128x128xf16>
   %matrixD:2 = gpu.alloc async [%token] () : memref<128x128xf32>
+  %align_matrixD = memref.assume_alignment %matrixD, 16 : memref<128x128xf32>
   %1 = gpu.memcpy async [%token] %matrixA, %matrixAHost : memref<128x128xf16>, memref<128x128xf16>
   %2 = gpu.memcpy async [%token] %matrixB, %matrixBHost : memref<128x128xf16>, memref<128x128xf16>
   %castA = memref.cast %matrixA : memref<128x128xf16> to memref<*xf16>
@@ -120,7 +121,6 @@ func.func @main() {
             threads(%arg3, %arg4, %arg5) in (%arg9 = %hc128, %arg10 = %hc1, %arg11 = %hc1)
             dynamic_shared_memory_size %shmemSize
 {
-    %align_matrixD = memref.assume_alignment %matrixD, 16 : memref<128x128xf32>
     %c256 = arith.constant 256 : index
     %c10000000 = arith.constant 10000000 : index
 
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/gemm_pred_f32_f16_f16_128x128x128.mlir b/mlir/test/Integration/GPU/CUDA/sm90/gemm_pred_f32_f16_f16_128x128x128.mlir
index b8e355712d37f..b094525a066f6 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/gemm_pred_f32_f16_f16_128x128x128.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/gemm_pred_f32_f16_f16_128x128x128.mlir
@@ -106,6 +106,7 @@ func.func @main() {
   %matrixA:2 = gpu.alloc async [%token] () : memref<128x128xf16>
   %matrixB:2 = gpu.alloc async [%token] () : memref<128x128xf16>
   %matrixD:2 = gpu.alloc async [%token] () : memref<128x128xf32>
+  %align_matrixD = memref.assume_alignment %matrixD, 16 : memref<128x128xf32>
   %1 = gpu.memcpy async [%token] %matrixA, %matrixAHost : memref<128x128xf16>, memref<128x128xf16>
   %2 = gpu.memcpy async [%token] %matrixB, %matrixBHost : memref<128x128xf16>, memref<128x128xf16>
   %castA = memref.cast %matrixA : memref<128x128xf16> to memref<*xf16>
@@ -120,7 +121,6 @@ func.func @main() {
             threads(%arg3, %arg4, %arg5) in (%arg9 = %hc128, %arg10 = %hc1, %arg11 = %hc1)
             dynamic_shared_memory_size %shmemSize
 {
-    %align_matrixD = memref.assume_alignment %matrixD, 16 : memref<128x128xf32>
     %c256 = arith.constant 256 : index
     %c10000000 = arith.constant 10000000 : index
 
From 9e4d165ad1b2486e7fc076a737bb704bc07e4a82 Mon Sep 17 00:00:00 2001
From: hanhanW
Date: Mon, 2 Jun 2025 07:07:21 -0700
Subject: [PATCH 3/3] address comments.
Signed-off-by: hanhanW
---
 .../GPU/CUDA/sm90/gemm_f32_f16_f16_128x128x128.mlir      | 4 ++--
 .../GPU/CUDA/sm90/gemm_pred_f32_f16_f16_128x128x128.mlir | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/mlir/test/Integration/GPU/CUDA/sm90/gemm_f32_f16_f16_128x128x128.mlir b/mlir/test/Integration/GPU/CUDA/sm90/gemm_f32_f16_f16_128x128x128.mlir
index 3697317e0f3d5..a5653f395a2c4 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/gemm_f32_f16_f16_128x128x128.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/gemm_f32_f16_f16_128x128x128.mlir
@@ -106,7 +106,6 @@ func.func @main() {
   %matrixA:2 = gpu.alloc async [%token] () : memref<128x128xf16>
   %matrixB:2 = gpu.alloc async [%token] () : memref<128x128xf16>
   %matrixD:2 = gpu.alloc async [%token] () : memref<128x128xf32>
-  %align_matrixD = memref.assume_alignment %matrixD, 16 : memref<128x128xf32>
   %1 = gpu.memcpy async [%token] %matrixA, %matrixAHost : memref<128x128xf16>, memref<128x128xf16>
   %2 = gpu.memcpy async [%token] %matrixB, %matrixBHost : memref<128x128xf16>, memref<128x128xf16>
   %castA = memref.cast %matrixA : memref<128x128xf16> to memref<*xf16>
@@ -121,6 +120,7 @@ func.func @main() {
             threads(%arg3, %arg4, %arg5) in (%arg9 = %hc128, %arg10 = %hc1, %arg11 = %hc1)
             dynamic_shared_memory_size %shmemSize
 {
+    %align_matrixD = memref.assume_alignment %matrixD, 16 : memref<128x128xf32>
     %c256 = arith.constant 256 : index
     %c10000000 = arith.constant 10000000 : index
 
@@ -232,7 +232,7 @@ func.func @main() {
   }
 
   // Step 5. Copy D2H
-  %5 = gpu.memcpy async [%token] %matrixDHost, %align_matrixD : memref<128x128xf32>, memref<128x128xf32>
+  %5 = gpu.memcpy async [%token] %matrixDHost, %matrixD : memref<128x128xf32>, memref<128x128xf32>
   gpu.wait [%token]
 
   // Step 6. Compute on host
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/gemm_pred_f32_f16_f16_128x128x128.mlir b/mlir/test/Integration/GPU/CUDA/sm90/gemm_pred_f32_f16_f16_128x128x128.mlir
index b094525a066f6..197351f1921e7 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/gemm_pred_f32_f16_f16_128x128x128.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/gemm_pred_f32_f16_f16_128x128x128.mlir
@@ -106,7 +106,6 @@ func.func @main() {
   %matrixA:2 = gpu.alloc async [%token] () : memref<128x128xf16>
   %matrixB:2 = gpu.alloc async [%token] () : memref<128x128xf16>
   %matrixD:2 = gpu.alloc async [%token] () : memref<128x128xf32>
-  %align_matrixD = memref.assume_alignment %matrixD, 16 : memref<128x128xf32>
   %1 = gpu.memcpy async [%token] %matrixA, %matrixAHost : memref<128x128xf16>, memref<128x128xf16>
   %2 = gpu.memcpy async [%token] %matrixB, %matrixBHost : memref<128x128xf16>, memref<128x128xf16>
   %castA = memref.cast %matrixA : memref<128x128xf16> to memref<*xf16>
@@ -121,6 +120,7 @@ func.func @main() {
             threads(%arg3, %arg4, %arg5) in (%arg9 = %hc128, %arg10 = %hc1, %arg11 = %hc1)
             dynamic_shared_memory_size %shmemSize
 {
+    %align_matrixD = memref.assume_alignment %matrixD, 16 : memref<128x128xf32>
     %c256 = arith.constant 256 : index
     %c10000000 = arith.constant 10000000 : index
 
@@ -240,7 +240,7 @@ func.func @main() {
   }
 
   // Step 5. Copy D2H
-  %5 = gpu.memcpy async [%token] %matrixDHost, %align_matrixD : memref<128x128xf32>, memref<128x128xf32>
+  %5 = gpu.memcpy async [%token] %matrixDHost, %matrixD : memref<128x128xf32>, memref<128x128xf32>
   gpu.wait [%token]
 
   // Step 6. Compute on host
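
Postscript: a minimal sketch of the usage pattern the series converges on, based only on the diffs above. The function name, shapes, and constant below are illustrative assumptions, not taken from the tests:

  func.func @use_aligned_view(%buf: memref<128x128xf32>) -> vector<4xf32> {
    %c0 = arith.constant 0 : index
    // memref.assume_alignment now yields a result; accesses that should see the
    // alignment assumption go through that result instead of the raw buffer.
    %aligned = memref.assume_alignment %buf, 16 : memref<128x128xf32>
    %v = vector.load %aligned[%c0, %c0] : memref<128x128xf32>, vector<4xf32>
    return %v : vector<4xf32>
  }

As in PATCH 3/3, operations that do not need the assumption (e.g. the host-side gpu.memcpy) can keep using the original %buf value directly.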