[NVGPU] Fix nvdsl examples - take 2 #167321
base: main
Conversation
@llvm/pr-subscribers-mlir-gpu @llvm/pr-subscribers-mlir-nvgpu
Author: Giacomo Castiglioni (castigli)
Changes: This PR re-lands #156830. It aims to fix the nvdsl examples, which drifted out of sync because they are not tested in the CI. The fixed bugs were related to the following PRs:
Patch is 56.82 KiB, truncated to 20.00 KiB below. Full version: https://github.com/llvm/llvm-project/pull/167321.diff
9 Files Affected:
diff --git a/mlir/test/Examples/NVGPU/Ch0.py b/mlir/test/Examples/NVGPU/Ch0.py
index 8f60088178d11..e09720a0f3b75 100644
--- a/mlir/test/Examples/NVGPU/Ch0.py
+++ b/mlir/test/Examples/NVGPU/Ch0.py
@@ -1,5 +1,9 @@
# RUN: env SUPPORT_LIB=%mlir_cuda_runtime \
-# RUN: %PYTHON %s | FileCheck %s
+# RUN: sh -c 'if [[ "%mlir_run_cuda_sm90_tests" == "1" ]]; \
+# RUN: then %PYTHON %s | FileCheck %s; \
+# RUN: else export MLIR_NVDSL_PRINT_IR=1; \
+# RUN: %PYTHON %s | FileCheck %s --check-prefix=DUMPIR; fi'
+
# ===----------------------------------------------------------------------===//
# Chapter 0 : Hello World
@@ -33,7 +37,7 @@ def kernel():
# + operator generates arith.addi
myValue = alpha + tidx
# Print from a GPU thread
- gpu.printf("GPU thread %llu has %llu\n", [tidx, myValue])
+ gpu.printf("GPU thread %llu has %llu\n", tidx, myValue)
# 3. Call the GPU kernel
kernel()
@@ -43,8 +47,24 @@ def kernel():
# 4. The `mlir_func` decorator JIT compiles the IR and executes the MLIR function.
main(alpha)
-
# CHECK: GPU thread 0 has 100
# CHECK: GPU thread 1 has 101
# CHECK: GPU thread 2 has 102
# CHECK: GPU thread 3 has 103
+
+# DUMPIR: func.func @main(%arg0: index) attributes {llvm.emit_c_interface} {
+# DUMPIR: %[[C0_I32:.*]] = arith.constant 0 : i32
+# DUMPIR: %[[C1:.*]] = arith.constant 1 : index
+# DUMPIR: %[[C1_0:.*]] = arith.constant 1 : index
+# DUMPIR: %[[C1_1:.*]] = arith.constant 1 : index
+# DUMPIR: %[[C4:.*]] = arith.constant 4 : index
+# DUMPIR: %[[C1_2:.*]] = arith.constant 1 : index
+# DUMPIR: %[[C1_3:.*]] = arith.constant 1 : index
+# DUMPIR: gpu.launch blocks(%arg1, %arg2, %arg3) in (%arg7 = %[[C1]], %arg8 = %[[C1_0]], %arg9 = %[[C1_1]]) threads(%arg4, %arg5, %arg6) in (%arg10 = %[[C4]], %arg11 = %[[C1_2]], %arg12 = %[[C1_3]]) dynamic_shared_memory_size %[[C0_I32]] {
+# DUMPIR: %[[TIDX:.*]] = gpu.thread_id x
+# DUMPIR: %[[MYVAL:.*]] = arith.addi %arg0, %[[TIDX]] : index
+# DUMPIR: gpu.printf "GPU thread %llu has %llu\0A", %[[TIDX]], %[[MYVAL]] : index, index
+# DUMPIR: gpu.terminator
+# DUMPIR: }
+# DUMPIR: return
+# DUMPIR: }
diff --git a/mlir/test/Examples/NVGPU/Ch1.py b/mlir/test/Examples/NVGPU/Ch1.py
index cfb48d56f8d49..6e44e4d04fa06 100644
--- a/mlir/test/Examples/NVGPU/Ch1.py
+++ b/mlir/test/Examples/NVGPU/Ch1.py
@@ -1,5 +1,9 @@
# RUN: env SUPPORT_LIB=%mlir_cuda_runtime \
-# RUN: %PYTHON %s | FileCheck %s
+# RUN: sh -c 'if [[ "%mlir_run_cuda_sm90_tests" == "1" ]]; \
+# RUN: then %PYTHON %s | FileCheck %s; \
+# RUN: else export MLIR_NVDSL_PRINT_IR=1; \
+# RUN: %PYTHON %s | FileCheck %s --check-prefix=DUMPIR; fi'
+
# ===----------------------------------------------------------------------===//
# Chapter 1 : 2D Saxpy
@@ -24,12 +28,12 @@
def saxpy(x, y, alpha):
# 1. Use MLIR GPU dialect to allocate and copy memory
token_ty = gpu.AsyncTokenType.get()
- t1 = gpu.wait(token_ty, [])
+ t1 = gpu.wait([])
x_dev, t2 = gpu.alloc(x.type, token_ty, [t1], [], [])
y_dev, t3 = gpu.alloc(y.type, token_ty, [t2], [], [])
t4 = gpu.memcpy(token_ty, [t3], x_dev, x)
t5 = gpu.memcpy(token_ty, [t4], y_dev, y)
- t6 = gpu.wait(token_ty, [t5])
+ t6 = gpu.wait([t5])
# 2. Compute 2D SAXPY kernel
@NVDSL.mlir_gpu_launch(grid=(M, 1, 1), block=(N, 1, 1))
@@ -47,7 +51,7 @@ def saxpy_kernel():
saxpy_kernel()
t7 = gpu.memcpy(token_ty, [t6], y, y_dev)
- gpu.wait(token_ty, [t7])
+ gpu.wait([t7])
# 3. Pass numpy arrays to MLIR
@@ -56,11 +60,32 @@ def saxpy_kernel():
alpha = 2.0
x = np.random.randn(M, N).astype(np.float32)
y = np.ones((M, N), np.float32)
+
saxpy(x, y, alpha)
-# 4. Verify MLIR with reference computation
-ref = np.ones((M, N), np.float32)
-ref += x * alpha
-np.testing.assert_allclose(y, ref, rtol=5e-03, atol=1e-01)
-print("PASS")
+if os.getenv("MLIR_NVDSL_PRINT_IR") != "1":
+ # 4. Verify MLIR with reference computation
+ ref = np.ones((M, N), np.float32)
+ ref += x * alpha
+ np.testing.assert_allclose(y, ref, rtol=5e-03, atol=1e-01)
+ print("PASS")
# CHECK-NOT: Mismatched elements
+# CHECK: PASS
+
+# DUMPIR: func.func @saxpy(%[[ARG0:.*]]: memref<256x32xf32>, %[[ARG1:.*]]: memref<256x32xf32>, %[[ARG2:.*]]: f32) attributes {llvm.emit_c_interface} {
+# DUMPIR: %[[WAIT0:.*]] = gpu.wait async
+# DUMPIR: %[[MEMREF:.*]], %[[ASYNC0:.*]] = gpu.alloc async [%[[WAIT0]]] () : memref<256x32xf32>
+# DUMPIR: %[[MEMREF0:.*]], %[[ASYNC1:.*]] = gpu.alloc async [%[[ASYNC0]]] () : memref<256x32xf32>
+# DUMPIR: %[[MEMCPY1:.*]] = gpu.memcpy async [%[[ASYNC1]]] %[[MEMREF]], %[[ARG0]] : memref<256x32xf32>, memref<256x32xf32>
+# DUMPIR: %[[MEMCPY2:.*]] = gpu.memcpy async [%[[MEMCPY1]]] %[[MEMREF0]], %[[ARG1]] : memref<256x32xf32>, memref<256x32xf32>
+# DUMPIR: %[[WAIT1:.*]] = gpu.wait async [%[[MEMCPY2]]]
+# DUMPIR: %[[LD0:.*]] = memref.load %[[MEMREF]][%{{.*}}, %{{.*}}] : memref<256x32xf32>
+# DUMPIR: %[[LD1:.*]] = memref.load %[[MEMREF0]][%{{.*}}, %{{.*}}] : memref<256x32xf32>
+# DUMPIR: %[[MUL:.*]] = arith.mulf %[[LD0]], %[[ARG2]] : f32
+# DUMPIR: %[[ADD:.*]] = arith.addf %[[LD1]], %[[MUL]] : f32
+# DUMPIR: memref.store %[[ADD]], %[[MEMREF0]][%{{.*}}, %{{.*}}] : memref<256x32xf32>
+# DUMPIR: gpu.terminator
+# DUMPIR: %[[MEMCPY3:.*]] = gpu.memcpy async [%[[WAIT1]]] %[[ARG1]], %[[MEMREF0]] : memref<256x32xf32>, memref<256x32xf32>
+# DUMPIR: %[[WAIT2:.*]] = gpu.wait async [%[[MEMCPY3]]]
+# DUMPIR: return
+# DUMPIR: }
diff --git a/mlir/test/Examples/NVGPU/Ch2.py b/mlir/test/Examples/NVGPU/Ch2.py
index 729913c6d5c4f..aba610cee0b34 100644
--- a/mlir/test/Examples/NVGPU/Ch2.py
+++ b/mlir/test/Examples/NVGPU/Ch2.py
@@ -1,5 +1,9 @@
# RUN: env SUPPORT_LIB=%mlir_cuda_runtime \
-# RUN: %PYTHON %s | FileCheck %s
+# RUN: sh -c 'if [[ "%mlir_run_cuda_sm90_tests" == "1" ]]; \
+# RUN: then %PYTHON %s | FileCheck %s; \
+# RUN: else export MLIR_NVDSL_PRINT_IR=1; \
+# RUN: %PYTHON %s | FileCheck %s --check-prefix=DUMPIR; fi'
+
# ===----------------------------------------------------------------------===//
# Chapter 2 : 2D Saxpy with TMA
@@ -28,12 +32,12 @@
@NVDSL.mlir_func
def saxpy(x, y, alpha):
token_ty = gpu.AsyncTokenType.get()
- t1 = gpu.wait(token_ty, [])
+ t1 = gpu.wait([])
x_dev, t2 = gpu.alloc(x.type, token_ty, [t1], [], [])
y_dev, t3 = gpu.alloc(y.type, token_ty, [t2], [], [])
t4 = gpu.memcpy(token_ty, [t3], x_dev, x)
t5 = gpu.memcpy(token_ty, [t4], y_dev, y)
- t6 = gpu.wait(token_ty, [t5])
+ t6 = gpu.wait([t5])
x_tma = TMA([1, N], x.type)
y_tma = TMA([1, N], y.type)
@@ -74,7 +78,7 @@ def saxpy_tma_kernel():
saxpy_tma_kernel()
t7 = gpu.memcpy(token_ty, [t6], y, y_dev)
- gpu.wait(token_ty, [t7])
+ gpu.wait([t7])
# 3. Pass numpy arrays to MLIR
@@ -85,9 +89,46 @@ def saxpy_tma_kernel():
y = np.ones((M, N), np.float32)
saxpy(x, y, alpha)
-# 4. Verify MLIR with reference computation
-ref = np.ones((M, N), np.float32)
-ref += x * alpha
-np.testing.assert_allclose(y, ref, rtol=5e-03, atol=1e-01)
-print("PASS")
+if os.getenv("MLIR_NVDSL_PRINT_IR") != "1":
+ # 4. Verify MLIR with reference computation
+ ref = np.ones((M, N), np.float32)
+ ref += x * alpha
+ np.testing.assert_allclose(y, ref, rtol=5e-03, atol=1e-01)
+ print("PASS")
# CHECK-NOT: Mismatched elements
+# CHECK: PASS
+
+# DUMPIR: func.func @saxpy(%{{.*}}: memref<256x32xf32>, %[[ARG1:.*]]: memref<256x32xf32>, %[[ARG2:.*]]: f32) attributes {llvm.emit_c_interface} {
+# DUMPIR: %[[WAIT0:.*]] = gpu.wait async
+# DUMPIR: %[[MEMREF:.*]], %[[ASYNC0:.*]] = gpu.alloc async [%[[WAIT0]]] () : memref<256x32xf32>
+# DUMPIR: %[[CAST:.*]] = memref.cast %[[MEMREF]] : memref<256x32xf32> to memref<*xf32>
+# DUMPIR: %[[C1:.*]] = arith.constant 1 : index
+# DUMPIR: %[[C32:.*]] = arith.constant 32 : index
+# DUMPIR: %[[TMA0:.*]] = nvgpu.tma.create.descriptor %[[CAST]] box[%[[C1]], %[[C32]]] : memref<*xf32> -> <tensor = memref<1x32xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>
+# DUMPIR: %[[C0:.*]] = arith.constant 0 : index
+# DUMPIR: %[[EQ:.*]] = arith.cmpi eq, %{{.*}}, %[[C0]] : index
+# DUMPIR: %[[MB:.*]] = nvgpu.mbarrier.create -> <memorySpace = #gpu.address_space<workgroup>>
+# DUMPIR: %[[C0_10:.*]] = arith.constant 0 : index
+# DUMPIR: %[[C1_11:.*]] = arith.constant 1 : index
+# DUMPIR: nvgpu.mbarrier.init %[[MB]][%[[C0_10]]], %[[C1_11]], predicate = %[[EQ]] : <memorySpace = #gpu.address_space<workgroup>>
+# DUMPIR: %[[DSM0:.*]] = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
+# DUMPIR: %[[C0_12:.*]] = arith.constant 0 : index
+# DUMPIR: %[[VIEW:.*]] = memref.view %[[DSM0]][%[[C0_12]]][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<1x32xf32, #gpu.address_space<workgroup>>
+# DUMPIR: %[[DSM1:.*]] = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
+# DUMPIR: %[[C128:.*]] = arith.constant 128 : index
+# DUMPIR: %[[VIEW_13:.*]] = memref.view %[[DSM1]][%[[C128]]][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<1x32xf32, #gpu.address_space<workgroup>>
+# DUMPIR: nvgpu.tma.async.load %[[TMA0]][%{{.*}}, %{{.*}}], %[[MB]][%{{.*}}] to %[[VIEW]], predicate = %[[EQ]] : <tensor = memref<1x32xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<1x32xf32, #gpu.address_space<workgroup>>
+# DUMPIR: nvgpu.mbarrier.arrive.expect_tx %[[MB]][%{{.*}}], %{{.*}}, predicate = %[[EQ]] : <memorySpace = #gpu.address_space<workgroup>>
+# DUMPIR: %[[C0_20:.*]] = arith.constant 0 : index
+# DUMPIR: %[[C10000000:.*]] = arith.constant 10000000 : index
+# DUMPIR: %[[FALSE:.*]] = arith.constant false
+# DUMPIR: nvgpu.mbarrier.try_wait.parity %[[MB]][%[[C0_20]]], %[[FALSE]], %[[C10000000]] : <memorySpace = #gpu.address_space<workgroup>>
+# DUMPIR: %[[C0_21:.*]] = arith.constant 0 : index
+# DUMPIR: %[[LD0:.*]] = memref.load %[[VIEW]][%[[C0_21]], %{{.*}}] : memref<1x32xf32, #gpu.address_space<workgroup>>
+# DUMPIR: %[[C0_22:.*]] = arith.constant 0 : index
+# DUMPIR: %[[LD1:.*]] = memref.load %[[VIEW_13]][%[[C0_22]], %{{.*}}] : memref<1x32xf32, #gpu.address_space<workgroup>>
+# DUMPIR: memref.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<256x32xf32>
+# DUMPIR: %[[MEMCPY3:.*]] = gpu.memcpy async [%{{.*}}] %[[ARG1]], %{{.*}} : memref<256x32xf32>, memref<256x32xf32>
+# DUMPIR: %{{.*}} = gpu.wait async [%[[MEMCPY3]]]
+# DUMPIR: return
+# DUMPIR: }
diff --git a/mlir/test/Examples/NVGPU/Ch3.py b/mlir/test/Examples/NVGPU/Ch3.py
index eb96b11c63416..fe11575416866 100644
--- a/mlir/test/Examples/NVGPU/Ch3.py
+++ b/mlir/test/Examples/NVGPU/Ch3.py
@@ -1,5 +1,9 @@
# RUN: env SUPPORT_LIB=%mlir_cuda_runtime \
-# RUN: %PYTHON %s | FileCheck %s
+# RUN: sh -c 'if [[ "%mlir_run_cuda_sm90_tests" == "1" ]]; \
+# RUN: then %PYTHON %s | FileCheck %s; \
+# RUN: else export MLIR_NVDSL_PRINT_IR=1; \
+# RUN: %PYTHON %s | FileCheck %s --check-prefix=DUMPIR; fi'
+
# ===----------------------------------------------------------------------===//
# Chapter 3 : GEMM 128x128x64 with Tensor Core
@@ -60,13 +64,13 @@ def tma_load(
@NVDSL.mlir_func
def gemm_128_128_64(a, b, d):
token_ty = gpu.AsyncTokenType.get()
- t1 = gpu.wait(token_ty, [])
+ t1 = gpu.wait([])
a_dev, t2 = gpu.alloc(a.type, token_ty, [t1], [], [])
b_dev, t3 = gpu.alloc(b.type, token_ty, [t2], [], [])
d_dev, t4 = gpu.alloc(d.type, token_ty, [t3], [], [])
t5 = gpu.memcpy(token_ty, [t4], a_dev, a)
t6 = gpu.memcpy(token_ty, [t5], b_dev, b)
- t7 = gpu.wait(token_ty, [t6])
+ t7 = gpu.wait([t6])
sw = nvgpu.TensorMapSwizzleKind.SWIZZLE_128B
a_tma = TMA([128, 64], a.type, swizzle=sw)
@@ -111,7 +115,7 @@ def gemm_tma_kernel():
gemm_tma_kernel()
t8 = gpu.memcpy(token_ty, [t7], d, d_dev)
- gpu.wait(None, [t8])
+ gpu.wait([t8])
# Python pass arguments to MLIR
@@ -123,7 +127,73 @@ def gemm_tma_kernel():
d = np.zeros((M, N), np.float32)
gemm_128_128_64(a, b, d)
-ref_d = a.astype(np.float16) @ b.astype(np.float16)
-np.testing.assert_allclose(d, ref_d, rtol=5e-03, atol=1e-01)
-print("PASS")
+if os.getenv("MLIR_NVDSL_PRINT_IR") != "1":
+ # Verify MLIR program with reference computation in python
+ ref_d = a.astype(np.float16) @ b.astype(np.float16)
+ np.testing.assert_allclose(d, ref_d, rtol=5e-03, atol=1e-01)
+ print("PASS")
# CHECK-NOT: Mismatched elements
+# CHECK: PASS
+
+# DUMPIR: func.func @gemm_128_128_64(%{{.*}}: memref<128x64xf16>, %{{.*}}: memref<64x128xf16>, %[[ARG2:.*]]: memref<128x128xf32>) attributes {llvm.emit_c_interface} {
+# DUMPIR: %[[C128:.*]] = arith.constant 128 : index
+# DUMPIR: %[[C64:.*]] = arith.constant 64 : index
+# DUMPIR: %[[TMA0:.*]] = nvgpu.tma.create.descriptor %{{.*}} box[%[[C128]], %[[C64]]] : memref<*xf16> -> <tensor = memref<128x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none>
+# DUMPIR: %[[CAST1:.*]] = memref.cast %{{.*}} : memref<64x128xf16> to memref<*xf16>
+# DUMPIR: %[[C64_5:.*]] = arith.constant 64 : index
+# DUMPIR: %[[C64_6:.*]] = arith.constant 64 : index
+# DUMPIR: %[[TMA1:.*]] = nvgpu.tma.create.descriptor %[[CAST1]] box[%[[C64_5]], %[[C64_6]]] : memref<*xf16> -> <tensor = memref<64x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none>
+# DUMPIR: %[[THREADID:.*]] = gpu.thread_id x
+# DUMPIR: %[[MB:.*]] = nvgpu.mbarrier.create -> <memorySpace = #gpu.address_space<workgroup>>
+# DUMPIR: %[[C0:.*]] = arith.constant 0 : index
+# DUMPIR: %[[EQ:.*]] = arith.cmpi eq, %[[THREADID]], %[[C0]] : index
+# DUMPIR: %[[C0_12:.*]] = arith.constant 0 : index
+# DUMPIR: %[[C1_13:.*]] = arith.constant 1 : index
+# DUMPIR: nvgpu.mbarrier.init %[[MB]][%[[C0_12]]], %[[C1_13]], predicate = %[[EQ]] : <memorySpace = #gpu.address_space<workgroup>>
+# DUMPIR: nvgpu.tma.prefetch.descriptor %[[TMA0]], predicate = %[[EQ]] : <tensor = memref<128x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none>
+# DUMPIR: nvgpu.tma.prefetch.descriptor %[[TMA1]], predicate = %[[EQ]] : <tensor = memref<64x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none>
+# DUMPIR: %[[DSM0:.*]] = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
+# DUMPIR: %[[C0_14:.*]] = arith.constant 0 : index
+# DUMPIR: %[[VIEW:.*]] = memref.view %[[DSM0]][%[[C0_14]]][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<128x64xf16, #gpu.address_space<workgroup>>
+# DUMPIR: %[[DSM1:.*]] = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
+# DUMPIR: %[[C16384:.*]] = arith.constant 16384 : index
+# DUMPIR: %[[VIEW_15:.*]] = memref.view %[[DSM1]][%[[C16384]]][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<64x128xf16, #gpu.address_space<workgroup>>
+# DUMPIR: %[[DSM2:.*]] = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
+# DUMPIR: %[[C0_16:.*]] = arith.constant 0 : index
+# DUMPIR: %[[VIEW_17:.*]] = memref.view %[[DSM2]][%[[C0_16]]][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<128x64xf16, #gpu.address_space<workgroup>>
+# DUMPIR: %[[DSM3:.*]] = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
+# DUMPIR: %[[C16384_18:.*]] = arith.constant 16384 : index
+# DUMPIR: %[[VIEW_19:.*]] = memref.view %[[DSM3]][%[[C16384_18]]][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<64x64xf16, #gpu.address_space<workgroup>>
+# DUMPIR: %[[DSM4:.*]] = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
+# DUMPIR: %[[C24576:.*]] = arith.constant 24576 : index
+# DUMPIR: %[[VIEW_20:.*]] = memref.view %[[DSM4]][%[[C24576]]][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<64x64xf16, #gpu.address_space<workgroup>>
+# DUMPIR: %[[C0_21:.*]] = arith.constant 0 : index
+# DUMPIR: %[[C32768:.*]] = arith.constant 32768 : index
+# DUMPIR: nvgpu.mbarrier.arrive.expect_tx %[[MB]][%[[C0_21]]], %[[C32768]], predicate = %[[EQ]] : <memorySpace = #gpu.address_space<workgroup>>
+# DUMPIR: %[[C0_22:.*]] = arith.constant 0 : index
+# DUMPIR: %[[C0_23:.*]] = arith.constant 0 : index
+# DUMPIR: %[[C0_24:.*]] = arith.constant 0 : index
+# DUMPIR: nvgpu.tma.async.load %[[TMA0]][%[[C0_23]], %[[C0_24]]], %[[MB]][%[[C0_22]]] to %[[VIEW_17]], predicate = %[[EQ]] : <tensor = memref<128x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<128x64xf16, #gpu.address_space<workgroup>>
+# DUMPIR: %[[C0_25:.*]] = arith.constant 0 : index
+# DUMPIR: %[[C0_26:.*]] = arith.constant 0 : index
+# DUMPIR: %[[C0_27:.*]] = arith.constant 0 : index
+# DUMPIR: nvgpu.tma.async.load %[[TMA1]][%[[C0_26]], %[[C0_27]]], %[[MB]][%[[C0_25]]] to %[[VIEW_19]], predicate = %[[EQ]] : <tensor = memref<64x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<64x64xf16, #gpu.address_space<workgroup>>
+# DUMPIR: %[[C0_28:.*]] = arith.constant 0 : index
+# DUMPIR: %[[C64_29:.*]] = arith.constant 64 : index
+# DUMPIR: %[[C0_30:.*]] = arith.constant 0 : index
+# DUMPIR: nvgpu.tma.async.load %[[TMA1]][%[[C64_29]], %[[C0_30]]], %[[MB]][%[[C0_28]]] to %[[VIEW_20]], predicate = %[[EQ]] : <tensor = memref<64x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<64x64xf16, #gpu.address_space<workgroup>>
+# DUMPIR: %[[C0_31:.*]] = arith.constant 0 : index
+# DUMPIR: %[[C10000000:.*]] = arith.constant 10000000 : index
+# DUMPIR: %[[FALSE:.*]] = arith.constant false
+# DUMPIR: nvgpu.mbarrier.try_wait.parity %[[MB]][%[[C0_31]]], %[[FALSE]], %[[C10000000]] : <memorySpace = #gpu.address_space<workgroup>>
+# DUMPIR: %[[WG_ACC:.*]] = nvgpu.warpgroup.mma.init.accumulator -> <fragmented = vector<128x128xf32>>
+# DUMPIR: %[[GEN0:.*]] = nvgpu.warpgroup.generate.descriptor %[[VIEW]], %[[TMA0]] : memref<128x64xf16, #gpu.address_space<workgroup>>, <tensor = memref<128x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none> -> <tensor = memref<128x64xf16, #gpu.address_space<workgroup>>>
+# DUMPIR: %[[GEN1:.*]] = nvgpu.warpgroup.generate.descriptor %[[VIEW_15]], %[[TMA1]] : memref<64x128xf16, #gpu.address_space<workgroup>>, <tensor = memref<64x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none> -> <tensor = memref<64x128xf16, #gpu.address_space<workgroup>>>
+# DUMPIR: %[[MMA:.*]] = nvgpu.warpgroup.mma %[[GEN0]], %[[GEN1]], %[[WG_ACC]] {transposeB} : <tensor = memref<128x64xf16, #gpu.address_space<workgroup>>>, <tensor = memref<64x128xf16, #gpu.address_space<workgroup>>>, <fragmented = vector<128x128xf32>> -> <fragmented = vector<128x128xf32>>
+# DUMPIR: nvgpu.warpgroup.mma.store %[[MMA]], %{{.*}} : <fragmented = vector<128x128xf32>> to memref<128x128xf32>
+# DUMPIR: gpu.terminator
+# DUMPIR: }
+# DUMPIR: %[[CPY3:.*]] = gpu.memcpy async [%{{.*}}] %[[ARG2]], %{{.*}} : memref<128x128xf32>, memref<128x128xf32>
+# DUMPIR: gpu.wait async [%[[CPY3]]]
+# DUMPIR: return
+# DUMPIR: }
diff --git a/mlir/test/Examples/NVGPU/Ch4.py b/mlir/test/Examples/NVGPU/Ch4.py
index 0e3460ff8d63b..dffafda7f21c9 100644
--- a/mlir/test/Examples/NVGPU/Ch4.py
+++ b/mlir/test/Examples/NVGPU/Ch4.py
@@ -1,5 +1,9 @@
# RUN: env SUPPORT_LIB=%mlir_cuda_runtime \
-# RUN: %PYTHON %s | FileCheck %s
+# RUN: sh -c 'if [[ "%mlir_run_cuda_sm90_tests" == "1" ]]; \
+# RUN: then %PYTHON %s | FileCheck %s; \
+# RUN: else export MLIR_NVDSL_PRINT_IR=1; \
+# RUN: %PYTHON %s | FileCheck %s --check-prefix=DUMPIR; fi'
+
# ===----------------------------------------------------------------------===//
# Chapter 4 : Multistage GEMM with Tensor Core
@@ -259,13 +263,13 @@ def epilogue(D: WGMMAMatrix, d_dev):
@N...
[truncated]
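The RUN-line change above switches each example between two modes: when the lit substitution %mlir_run_cuda_sm90_tests expands to "1", the script executes on the GPU and FileCheck uses the default CHECK prefix; otherwise MLIR_NVDSL_PRINT_IR=1 is exported and the DUMPIR prefix matches the printed IR instead. Below is a minimal, self-contained sketch of the same gating pattern the patch adds to every example. The saxpy function here is only a stand-in for the real NVDSL example (the actual kernel is built with NVDSL.mlir_func and the GPU dialect); M, N, and alpha mirror the values used in Ch1.py.

import os

import numpy as np

M, N = 256, 32
alpha = 2.0
x = np.random.randn(M, N).astype(np.float32)
y = np.ones((M, N), np.float32)


def saxpy(x, y, alpha):
    # Stand-in for the NVDSL example: with MLIR_NVDSL_PRINT_IR=1 the real
    # script only prints the generated MLIR and never runs the kernel, so y
    # is left untouched.
    if os.getenv("MLIR_NVDSL_PRINT_IR") == "1":
        print("func.func @saxpy(...)")  # placeholder for the IR dump
        return
    y += x * alpha  # what the GPU kernel computes


saxpy(x, y, alpha)

# Same gating the patch adds after each example: only compare against the
# NumPy reference when the kernel actually executed.
if os.getenv("MLIR_NVDSL_PRINT_IR") != "1":
    ref = np.ones((M, N), np.float32)
    ref += x * alpha
    np.testing.assert_allclose(y, ref, rtol=5e-03, atol=1e-01)
    print("PASS")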
This PR re-lands #156830.
This PR aims to fix the nvdsl examples, which drifted out of sync because they are not tested in the CI.
The fixed bugs were related to the following PRs: