
Conversation

@castigli
Contributor

This PR re-lands #156830

This PR fixes the NVDSL examples, which had drifted out of sync because they are not tested in the CI.

The bugs fixed here relate to the following PRs:

  • move to nanobind #118583
  • split gpu module initialization #135478
  • gpu dialect python API change #163883

@llvmbot
Member

llvmbot commented Nov 10, 2025

@llvm/pr-subscribers-mlir-gpu
@llvm/pr-subscribers-mlir

@llvm/pr-subscribers-mlir-nvgpu

Author: Giacomo Castiglioni (castigli)

Changes

This PR re-lands #156830

This PR fixes the NVDSL examples, which had drifted out of sync because they are not tested in the CI.

The bugs fixed here relate to the following PRs (a short before/after sketch of the affected calls follows this list):

  • move to nanobind #118583
  • split gpu module initialization #135478
  • gpu dialect python API change #163883
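
For orientation, the API adjustments those PRs require appear throughout the diff below; a minimal before/after sketch taken from the Ch0 and Ch1 hunks (the trailing comments are my reading of the change, not wording from this PR):

    # Before (old call forms removed by this patch):
    gpu.printf("GPU thread %llu has %llu\n", [tidx, myValue])  # format arguments as a list
    t1 = gpu.wait(token_ty, [])                                 # async token type passed explicitly

    # After (new call forms added by this patch):
    gpu.printf("GPU thread %llu has %llu\n", tidx, myValue)     # format arguments passed variadically
    t1 = gpu.wait([])                                           # async token type no longer passed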

Patch is 56.82 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/167321.diff

9 Files Affected:

  • (modified) mlir/test/Examples/NVGPU/Ch0.py (+23-3)
  • (modified) mlir/test/Examples/NVGPU/Ch1.py (+34-9)
  • (modified) mlir/test/Examples/NVGPU/Ch2.py (+50-9)
  • (modified) mlir/test/Examples/NVGPU/Ch3.py (+77-7)
  • (modified) mlir/test/Examples/NVGPU/Ch4.py (+156-10)
  • (modified) mlir/test/Examples/NVGPU/Ch5.py (+170-11)
  • (modified) mlir/test/Examples/NVGPU/lit.local.cfg (+1-1)
  • (modified) mlir/test/Examples/NVGPU/tools/nvdsl.py (+17-18)
  • (modified) mlir/test/Examples/NVGPU/tools/nvgpucompiler.py (+3-1)
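
A recurring pattern in the hunks below: the RUN line executes the example on hardware only when %mlir_run_cuda_sm90_tests is "1"; otherwise it sets MLIR_NVDSL_PRINT_IR=1 and FileCheck matches the dumped IR via the DUMPIR prefix, while the numpy verification is skipped. Condensed from the Ch1 hunk (the tools/nvdsl.py side, which presumably prints the module instead of executing it when the variable is set, falls below the truncation point):

    # RUN: env SUPPORT_LIB=%mlir_cuda_runtime \
    # RUN: sh -c 'if [[ "%mlir_run_cuda_sm90_tests" == "1" ]]; \
    # RUN: then %PYTHON %s | FileCheck %s; \
    # RUN: else export MLIR_NVDSL_PRINT_IR=1; \
    # RUN: %PYTHON %s | FileCheck %s --check-prefix=DUMPIR; fi'

    if os.getenv("MLIR_NVDSL_PRINT_IR") != "1":
        # Numerical verification only runs when the kernel actually executed.
        np.testing.assert_allclose(y, ref, rtol=5e-03, atol=1e-01)
        print("PASS")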
diff --git a/mlir/test/Examples/NVGPU/Ch0.py b/mlir/test/Examples/NVGPU/Ch0.py
index 8f60088178d11..e09720a0f3b75 100644
--- a/mlir/test/Examples/NVGPU/Ch0.py
+++ b/mlir/test/Examples/NVGPU/Ch0.py
@@ -1,5 +1,9 @@
 # RUN: env SUPPORT_LIB=%mlir_cuda_runtime \
-# RUN:   %PYTHON %s | FileCheck %s
+# RUN: sh -c 'if [[ "%mlir_run_cuda_sm90_tests" == "1" ]]; \
+# RUN: then %PYTHON %s | FileCheck %s; \
+# RUN: else export MLIR_NVDSL_PRINT_IR=1; \
+# RUN: %PYTHON %s | FileCheck %s --check-prefix=DUMPIR; fi'
+
 
 # ===----------------------------------------------------------------------===//
 #  Chapter 0 : Hello World
@@ -33,7 +37,7 @@ def kernel():
         # + operator generates arith.addi
         myValue = alpha + tidx
         # Print from a GPU thread
-        gpu.printf("GPU thread %llu has %llu\n", [tidx, myValue])
+        gpu.printf("GPU thread %llu has %llu\n", tidx, myValue)
 
     # 3. Call the GPU kernel
     kernel()
@@ -43,8 +47,24 @@ def kernel():
 # 4. The `mlir_func` decorator JIT compiles the IR and executes the MLIR function.
 main(alpha)
 
-
 # CHECK: GPU thread 0 has 100
 # CHECK: GPU thread 1 has 101
 # CHECK: GPU thread 2 has 102
 # CHECK: GPU thread 3 has 103
+
+# DUMPIR:   func.func @main(%arg0: index) attributes {llvm.emit_c_interface} {
+# DUMPIR:     %[[C0_I32:.*]] = arith.constant 0 : i32
+# DUMPIR:     %[[C1:.*]] = arith.constant 1 : index
+# DUMPIR:     %[[C1_0:.*]] = arith.constant 1 : index
+# DUMPIR:     %[[C1_1:.*]] = arith.constant 1 : index
+# DUMPIR:     %[[C4:.*]] = arith.constant 4 : index
+# DUMPIR:     %[[C1_2:.*]] = arith.constant 1 : index
+# DUMPIR:     %[[C1_3:.*]] = arith.constant 1 : index
+# DUMPIR:     gpu.launch blocks(%arg1, %arg2, %arg3) in (%arg7 = %[[C1]], %arg8 = %[[C1_0]], %arg9 = %[[C1_1]]) threads(%arg4, %arg5, %arg6) in (%arg10 = %[[C4]], %arg11 = %[[C1_2]], %arg12 = %[[C1_3]]) dynamic_shared_memory_size %[[C0_I32]] {
+# DUMPIR:       %[[TIDX:.*]] = gpu.thread_id  x
+# DUMPIR:       %[[MYVAL:.*]] = arith.addi %arg0, %[[TIDX]] : index
+# DUMPIR:       gpu.printf "GPU thread %llu has %llu\0A", %[[TIDX]], %[[MYVAL]] : index, index
+# DUMPIR:       gpu.terminator
+# DUMPIR:     }
+# DUMPIR:     return
+# DUMPIR:   }
diff --git a/mlir/test/Examples/NVGPU/Ch1.py b/mlir/test/Examples/NVGPU/Ch1.py
index cfb48d56f8d49..6e44e4d04fa06 100644
--- a/mlir/test/Examples/NVGPU/Ch1.py
+++ b/mlir/test/Examples/NVGPU/Ch1.py
@@ -1,5 +1,9 @@
 # RUN: env SUPPORT_LIB=%mlir_cuda_runtime \
-# RUN:   %PYTHON %s | FileCheck %s
+# RUN: sh -c 'if [[ "%mlir_run_cuda_sm90_tests" == "1" ]]; \
+# RUN: then %PYTHON %s | FileCheck %s; \
+# RUN: else export MLIR_NVDSL_PRINT_IR=1; \
+# RUN: %PYTHON %s | FileCheck %s --check-prefix=DUMPIR; fi'
+
 
 # ===----------------------------------------------------------------------===//
 #  Chapter 1 : 2D Saxpy
@@ -24,12 +28,12 @@
 def saxpy(x, y, alpha):
     # 1. Use MLIR GPU dialect to allocate and copy memory
     token_ty = gpu.AsyncTokenType.get()
-    t1 = gpu.wait(token_ty, [])
+    t1 = gpu.wait([])
     x_dev, t2 = gpu.alloc(x.type, token_ty, [t1], [], [])
     y_dev, t3 = gpu.alloc(y.type, token_ty, [t2], [], [])
     t4 = gpu.memcpy(token_ty, [t3], x_dev, x)
     t5 = gpu.memcpy(token_ty, [t4], y_dev, y)
-    t6 = gpu.wait(token_ty, [t5])
+    t6 = gpu.wait([t5])
 
     # 2. Compute 2D SAXPY kernel
     @NVDSL.mlir_gpu_launch(grid=(M, 1, 1), block=(N, 1, 1))
@@ -47,7 +51,7 @@ def saxpy_kernel():
     saxpy_kernel()
 
     t7 = gpu.memcpy(token_ty, [t6], y, y_dev)
-    gpu.wait(token_ty, [t7])
+    gpu.wait([t7])
 
 
 # 3. Pass numpy arrays to MLIR
@@ -56,11 +60,32 @@ def saxpy_kernel():
 alpha = 2.0
 x = np.random.randn(M, N).astype(np.float32)
 y = np.ones((M, N), np.float32)
+
 saxpy(x, y, alpha)
 
-#  4. Verify MLIR with reference computation
-ref = np.ones((M, N), np.float32)
-ref += x * alpha
-np.testing.assert_allclose(y, ref, rtol=5e-03, atol=1e-01)
-print("PASS")
+if os.getenv("MLIR_NVDSL_PRINT_IR") != "1":
+    # 4. Verify MLIR with reference computation
+    ref = np.ones((M, N), np.float32)
+    ref += x * alpha
+    np.testing.assert_allclose(y, ref, rtol=5e-03, atol=1e-01)
+    print("PASS")
 # CHECK-NOT: Mismatched elements
+# CHECK: PASS
+
+# DUMPIR:   func.func @saxpy(%[[ARG0:.*]]: memref<256x32xf32>, %[[ARG1:.*]]: memref<256x32xf32>, %[[ARG2:.*]]: f32) attributes {llvm.emit_c_interface} {
+# DUMPIR:     %[[WAIT0:.*]] = gpu.wait async
+# DUMPIR:     %[[MEMREF:.*]], %[[ASYNC0:.*]] = gpu.alloc async [%[[WAIT0]]] () : memref<256x32xf32>
+# DUMPIR:     %[[MEMREF0:.*]], %[[ASYNC1:.*]] = gpu.alloc async [%[[ASYNC0]]] () : memref<256x32xf32>
+# DUMPIR:     %[[MEMCPY1:.*]] = gpu.memcpy async [%[[ASYNC1]]] %[[MEMREF]], %[[ARG0]] : memref<256x32xf32>, memref<256x32xf32>
+# DUMPIR:     %[[MEMCPY2:.*]] = gpu.memcpy async [%[[MEMCPY1]]] %[[MEMREF0]], %[[ARG1]] : memref<256x32xf32>, memref<256x32xf32>
+# DUMPIR:     %[[WAIT1:.*]] = gpu.wait async [%[[MEMCPY2]]]
+# DUMPIR:       %[[LD0:.*]] = memref.load %[[MEMREF]][%{{.*}}, %{{.*}}] : memref<256x32xf32>
+# DUMPIR:       %[[LD1:.*]] = memref.load %[[MEMREF0]][%{{.*}}, %{{.*}}] : memref<256x32xf32>
+# DUMPIR:       %[[MUL:.*]] = arith.mulf %[[LD0]], %[[ARG2]] : f32
+# DUMPIR:       %[[ADD:.*]] = arith.addf %[[LD1]], %[[MUL]] : f32
+# DUMPIR:       memref.store %[[ADD]], %[[MEMREF0]][%{{.*}}, %{{.*}}] : memref<256x32xf32>
+# DUMPIR:       gpu.terminator
+# DUMPIR:     %[[MEMCPY3:.*]] = gpu.memcpy async [%[[WAIT1]]] %[[ARG1]], %[[MEMREF0]] : memref<256x32xf32>, memref<256x32xf32>
+# DUMPIR:     %[[WAIT2:.*]] = gpu.wait async [%[[MEMCPY3]]]
+# DUMPIR:     return
+# DUMPIR:   }
diff --git a/mlir/test/Examples/NVGPU/Ch2.py b/mlir/test/Examples/NVGPU/Ch2.py
index 729913c6d5c4f..aba610cee0b34 100644
--- a/mlir/test/Examples/NVGPU/Ch2.py
+++ b/mlir/test/Examples/NVGPU/Ch2.py
@@ -1,5 +1,9 @@
 # RUN: env SUPPORT_LIB=%mlir_cuda_runtime \
-# RUN:   %PYTHON %s | FileCheck %s
+# RUN: sh -c 'if [[ "%mlir_run_cuda_sm90_tests" == "1" ]]; \
+# RUN: then %PYTHON %s | FileCheck %s; \
+# RUN: else export MLIR_NVDSL_PRINT_IR=1; \
+# RUN: %PYTHON %s | FileCheck %s --check-prefix=DUMPIR; fi'
+
 
 # ===----------------------------------------------------------------------===//
 #  Chapter 2 : 2D Saxpy with TMA
@@ -28,12 +32,12 @@
 @NVDSL.mlir_func
 def saxpy(x, y, alpha):
     token_ty = gpu.AsyncTokenType.get()
-    t1 = gpu.wait(token_ty, [])
+    t1 = gpu.wait([])
     x_dev, t2 = gpu.alloc(x.type, token_ty, [t1], [], [])
     y_dev, t3 = gpu.alloc(y.type, token_ty, [t2], [], [])
     t4 = gpu.memcpy(token_ty, [t3], x_dev, x)
     t5 = gpu.memcpy(token_ty, [t4], y_dev, y)
-    t6 = gpu.wait(token_ty, [t5])
+    t6 = gpu.wait([t5])
 
     x_tma = TMA([1, N], x.type)
     y_tma = TMA([1, N], y.type)
@@ -74,7 +78,7 @@ def saxpy_tma_kernel():
     saxpy_tma_kernel()
 
     t7 = gpu.memcpy(token_ty, [t6], y, y_dev)
-    gpu.wait(token_ty, [t7])
+    gpu.wait([t7])
 
 
 # 3. Pass numpy arrays to MLIR
@@ -85,9 +89,46 @@ def saxpy_tma_kernel():
 y = np.ones((M, N), np.float32)
 saxpy(x, y, alpha)
 
-#  4. Verify MLIR with reference computation
-ref = np.ones((M, N), np.float32)
-ref += x * alpha
-np.testing.assert_allclose(y, ref, rtol=5e-03, atol=1e-01)
-print("PASS")
+if os.getenv("MLIR_NVDSL_PRINT_IR") != "1":
+    #  4. Verify MLIR with reference computation
+    ref = np.ones((M, N), np.float32)
+    ref += x * alpha
+    np.testing.assert_allclose(y, ref, rtol=5e-03, atol=1e-01)
+    print("PASS")
 # CHECK-NOT: Mismatched elements
+# CHECK: PASS
+
+# DUMPIR:   func.func @saxpy(%{{.*}}: memref<256x32xf32>, %[[ARG1:.*]]: memref<256x32xf32>, %[[ARG2:.*]]: f32) attributes {llvm.emit_c_interface} {
+# DUMPIR:     %[[WAIT0:.*]] = gpu.wait async
+# DUMPIR:     %[[MEMREF:.*]], %[[ASYNC0:.*]] = gpu.alloc async [%[[WAIT0]]] () : memref<256x32xf32>
+# DUMPIR:     %[[CAST:.*]] = memref.cast %[[MEMREF]] : memref<256x32xf32> to memref<*xf32>
+# DUMPIR:     %[[C1:.*]] = arith.constant 1 : index
+# DUMPIR:     %[[C32:.*]] = arith.constant 32 : index
+# DUMPIR:     %[[TMA0:.*]] = nvgpu.tma.create.descriptor %[[CAST]] box[%[[C1]], %[[C32]]] : memref<*xf32> -> <tensor = memref<1x32xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>
+# DUMPIR:       %[[C0:.*]] = arith.constant 0 : index
+# DUMPIR:       %[[EQ:.*]] = arith.cmpi eq, %{{.*}}, %[[C0]] : index
+# DUMPIR:       %[[MB:.*]] = nvgpu.mbarrier.create -> <memorySpace = #gpu.address_space<workgroup>>
+# DUMPIR:       %[[C0_10:.*]] = arith.constant 0 : index
+# DUMPIR:       %[[C1_11:.*]] = arith.constant 1 : index
+# DUMPIR:       nvgpu.mbarrier.init %[[MB]][%[[C0_10]]], %[[C1_11]], predicate = %[[EQ]] : <memorySpace = #gpu.address_space<workgroup>>
+# DUMPIR:       %[[DSM0:.*]] = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
+# DUMPIR:       %[[C0_12:.*]] = arith.constant 0 : index
+# DUMPIR:       %[[VIEW:.*]] = memref.view %[[DSM0]][%[[C0_12]]][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<1x32xf32, #gpu.address_space<workgroup>>
+# DUMPIR:       %[[DSM1:.*]] = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
+# DUMPIR:       %[[C128:.*]] = arith.constant 128 : index
+# DUMPIR:       %[[VIEW_13:.*]] = memref.view %[[DSM1]][%[[C128]]][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<1x32xf32, #gpu.address_space<workgroup>>
+# DUMPIR:       nvgpu.tma.async.load %[[TMA0]][%{{.*}}, %{{.*}}], %[[MB]][%{{.*}}] to %[[VIEW]], predicate = %[[EQ]] : <tensor = memref<1x32xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<1x32xf32, #gpu.address_space<workgroup>>
+# DUMPIR:       nvgpu.mbarrier.arrive.expect_tx %[[MB]][%{{.*}}], %{{.*}}, predicate = %[[EQ]] : <memorySpace = #gpu.address_space<workgroup>>
+# DUMPIR:       %[[C0_20:.*]] = arith.constant 0 : index
+# DUMPIR:       %[[C10000000:.*]] = arith.constant 10000000 : index
+# DUMPIR:       %[[FALSE:.*]] = arith.constant false
+# DUMPIR:       nvgpu.mbarrier.try_wait.parity %[[MB]][%[[C0_20]]], %[[FALSE]], %[[C10000000]] : <memorySpace = #gpu.address_space<workgroup>>
+# DUMPIR:       %[[C0_21:.*]] = arith.constant 0 : index
+# DUMPIR:       %[[LD0:.*]] = memref.load %[[VIEW]][%[[C0_21]], %{{.*}}] : memref<1x32xf32, #gpu.address_space<workgroup>>
+# DUMPIR:       %[[C0_22:.*]] = arith.constant 0 : index
+# DUMPIR:       %[[LD1:.*]] = memref.load %[[VIEW_13]][%[[C0_22]], %{{.*}}] : memref<1x32xf32, #gpu.address_space<workgroup>>
+# DUMPIR:       memref.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<256x32xf32>
+# DUMPIR:     %[[MEMCPY3:.*]] = gpu.memcpy async [%{{.*}}] %[[ARG1]], %{{.*}} : memref<256x32xf32>, memref<256x32xf32>
+# DUMPIR:     %{{.*}} = gpu.wait async [%[[MEMCPY3]]]
+# DUMPIR:     return
+# DUMPIR:   }
diff --git a/mlir/test/Examples/NVGPU/Ch3.py b/mlir/test/Examples/NVGPU/Ch3.py
index eb96b11c63416..fe11575416866 100644
--- a/mlir/test/Examples/NVGPU/Ch3.py
+++ b/mlir/test/Examples/NVGPU/Ch3.py
@@ -1,5 +1,9 @@
 # RUN: env SUPPORT_LIB=%mlir_cuda_runtime \
-# RUN:   %PYTHON %s | FileCheck %s
+# RUN: sh -c 'if [[ "%mlir_run_cuda_sm90_tests" == "1" ]]; \
+# RUN: then %PYTHON %s | FileCheck %s; \
+# RUN: else export MLIR_NVDSL_PRINT_IR=1; \
+# RUN: %PYTHON %s | FileCheck %s --check-prefix=DUMPIR; fi'
+
 
 # ===----------------------------------------------------------------------===//
 #  Chapter 3 : GEMM 128x128x64 with Tensor Core
@@ -60,13 +64,13 @@ def tma_load(
 @NVDSL.mlir_func
 def gemm_128_128_64(a, b, d):
     token_ty = gpu.AsyncTokenType.get()
-    t1 = gpu.wait(token_ty, [])
+    t1 = gpu.wait([])
     a_dev, t2 = gpu.alloc(a.type, token_ty, [t1], [], [])
     b_dev, t3 = gpu.alloc(b.type, token_ty, [t2], [], [])
     d_dev, t4 = gpu.alloc(d.type, token_ty, [t3], [], [])
     t5 = gpu.memcpy(token_ty, [t4], a_dev, a)
     t6 = gpu.memcpy(token_ty, [t5], b_dev, b)
-    t7 = gpu.wait(token_ty, [t6])
+    t7 = gpu.wait([t6])
 
     sw = nvgpu.TensorMapSwizzleKind.SWIZZLE_128B
     a_tma = TMA([128, 64], a.type, swizzle=sw)
@@ -111,7 +115,7 @@ def gemm_tma_kernel():
     gemm_tma_kernel()
 
     t8 = gpu.memcpy(token_ty, [t7], d, d_dev)
-    gpu.wait(None, [t8])
+    gpu.wait([t8])
 
 
 # Python pass arguments to MLIR
@@ -123,7 +127,73 @@ def gemm_tma_kernel():
 d = np.zeros((M, N), np.float32)
 gemm_128_128_64(a, b, d)
 
-ref_d = a.astype(np.float16) @ b.astype(np.float16)
-np.testing.assert_allclose(d, ref_d, rtol=5e-03, atol=1e-01)
-print("PASS")
+if os.getenv("MLIR_NVDSL_PRINT_IR") != "1":
+    # Verify MLIR program with reference computation in python
+    ref_d = a.astype(np.float16) @ b.astype(np.float16)
+    np.testing.assert_allclose(d, ref_d, rtol=5e-03, atol=1e-01)
+    print("PASS")
 # CHECK-NOT: Mismatched elements
+# CHECK: PASS
+
+# DUMPIR:   func.func @gemm_128_128_64(%{{.*}}: memref<128x64xf16>, %{{.*}}: memref<64x128xf16>, %[[ARG2:.*]]: memref<128x128xf32>) attributes {llvm.emit_c_interface} {
+# DUMPIR:     %[[C128:.*]] = arith.constant 128 : index
+# DUMPIR:     %[[C64:.*]] = arith.constant 64 : index
+# DUMPIR:     %[[TMA0:.*]] = nvgpu.tma.create.descriptor %{{.*}} box[%[[C128]], %[[C64]]] : memref<*xf16> -> <tensor = memref<128x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none>
+# DUMPIR:     %[[CAST1:.*]] = memref.cast %{{.*}} : memref<64x128xf16> to memref<*xf16>
+# DUMPIR:     %[[C64_5:.*]] = arith.constant 64 : index
+# DUMPIR:     %[[C64_6:.*]] = arith.constant 64 : index
+# DUMPIR:     %[[TMA1:.*]] = nvgpu.tma.create.descriptor %[[CAST1]] box[%[[C64_5]], %[[C64_6]]] : memref<*xf16> -> <tensor = memref<64x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none>
+# DUMPIR:       %[[THREADID:.*]] = gpu.thread_id  x
+# DUMPIR:       %[[MB:.*]] = nvgpu.mbarrier.create -> <memorySpace = #gpu.address_space<workgroup>>
+# DUMPIR:       %[[C0:.*]] = arith.constant 0 : index
+# DUMPIR:       %[[EQ:.*]] = arith.cmpi eq, %[[THREADID]], %[[C0]] : index
+# DUMPIR:       %[[C0_12:.*]] = arith.constant 0 : index
+# DUMPIR:       %[[C1_13:.*]] = arith.constant 1 : index
+# DUMPIR:       nvgpu.mbarrier.init %[[MB]][%[[C0_12]]], %[[C1_13]], predicate = %[[EQ]] : <memorySpace = #gpu.address_space<workgroup>>
+# DUMPIR:       nvgpu.tma.prefetch.descriptor %[[TMA0]], predicate = %[[EQ]] : <tensor = memref<128x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none>
+# DUMPIR:       nvgpu.tma.prefetch.descriptor %[[TMA1]], predicate = %[[EQ]] : <tensor = memref<64x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none>
+# DUMPIR:       %[[DSM0:.*]] = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
+# DUMPIR:       %[[C0_14:.*]] = arith.constant 0 : index
+# DUMPIR:       %[[VIEW:.*]] = memref.view %[[DSM0]][%[[C0_14]]][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<128x64xf16, #gpu.address_space<workgroup>>
+# DUMPIR:       %[[DSM1:.*]] = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
+# DUMPIR:       %[[C16384:.*]] = arith.constant 16384 : index
+# DUMPIR:       %[[VIEW_15:.*]] = memref.view %[[DSM1]][%[[C16384]]][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<64x128xf16, #gpu.address_space<workgroup>>
+# DUMPIR:       %[[DSM2:.*]] = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
+# DUMPIR:       %[[C0_16:.*]] = arith.constant 0 : index
+# DUMPIR:       %[[VIEW_17:.*]] = memref.view %[[DSM2]][%[[C0_16]]][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<128x64xf16, #gpu.address_space<workgroup>>
+# DUMPIR:       %[[DSM3:.*]] = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
+# DUMPIR:       %[[C16384_18:.*]] = arith.constant 16384 : index
+# DUMPIR:       %[[VIEW_19:.*]] = memref.view %[[DSM3]][%[[C16384_18]]][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<64x64xf16, #gpu.address_space<workgroup>>
+# DUMPIR:       %[[DSM4:.*]] = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
+# DUMPIR:       %[[C24576:.*]] = arith.constant 24576 : index
+# DUMPIR:       %[[VIEW_20:.*]] = memref.view %[[DSM4]][%[[C24576]]][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<64x64xf16, #gpu.address_space<workgroup>>
+# DUMPIR:       %[[C0_21:.*]] = arith.constant 0 : index
+# DUMPIR:       %[[C32768:.*]] = arith.constant 32768 : index
+# DUMPIR:       nvgpu.mbarrier.arrive.expect_tx %[[MB]][%[[C0_21]]], %[[C32768]], predicate = %[[EQ]] : <memorySpace = #gpu.address_space<workgroup>>
+# DUMPIR:       %[[C0_22:.*]] = arith.constant 0 : index
+# DUMPIR:       %[[C0_23:.*]] = arith.constant 0 : index
+# DUMPIR:       %[[C0_24:.*]] = arith.constant 0 : index
+# DUMPIR:       nvgpu.tma.async.load %[[TMA0]][%[[C0_23]], %[[C0_24]]], %[[MB]][%[[C0_22]]] to %[[VIEW_17]], predicate = %[[EQ]] : <tensor = memref<128x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<128x64xf16, #gpu.address_space<workgroup>>
+# DUMPIR:       %[[C0_25:.*]] = arith.constant 0 : index
+# DUMPIR:       %[[C0_26:.*]] = arith.constant 0 : index
+# DUMPIR:       %[[C0_27:.*]] = arith.constant 0 : index
+# DUMPIR:       nvgpu.tma.async.load %[[TMA1]][%[[C0_26]], %[[C0_27]]], %[[MB]][%[[C0_25]]] to %[[VIEW_19]], predicate = %[[EQ]] : <tensor = memref<64x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<64x64xf16, #gpu.address_space<workgroup>>
+# DUMPIR:       %[[C0_28:.*]] = arith.constant 0 : index
+# DUMPIR:       %[[C64_29:.*]] = arith.constant 64 : index
+# DUMPIR:       %[[C0_30:.*]] = arith.constant 0 : index
+# DUMPIR:       nvgpu.tma.async.load %[[TMA1]][%[[C64_29]], %[[C0_30]]], %[[MB]][%[[C0_28]]] to %[[VIEW_20]], predicate = %[[EQ]] : <tensor = memref<64x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<64x64xf16, #gpu.address_space<workgroup>>
+# DUMPIR:       %[[C0_31:.*]] = arith.constant 0 : index
+# DUMPIR:       %[[C10000000:.*]] = arith.constant 10000000 : index
+# DUMPIR:       %[[FALSE:.*]] = arith.constant false
+# DUMPIR:       nvgpu.mbarrier.try_wait.parity %[[MB]][%[[C0_31]]], %[[FALSE]], %[[C10000000]] : <memorySpace = #gpu.address_space<workgroup>>
+# DUMPIR:       %[[WG_ACC:.*]] = nvgpu.warpgroup.mma.init.accumulator -> <fragmented = vector<128x128xf32>>
+# DUMPIR:       %[[GEN0:.*]] = nvgpu.warpgroup.generate.descriptor %[[VIEW]], %[[TMA0]] : memref<128x64xf16, #gpu.address_space<workgroup>>, <tensor = memref<128x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none> -> <tensor = memref<128x64xf16, #gpu.address_space<workgroup>>>
+# DUMPIR:       %[[GEN1:.*]] = nvgpu.warpgroup.generate.descriptor %[[VIEW_15]], %[[TMA1]] : memref<64x128xf16, #gpu.address_space<workgroup>>, <tensor = memref<64x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none> -> <tensor = memref<64x128xf16, #gpu.address_space<workgroup>>>
+# DUMPIR:       %[[MMA:.*]] = nvgpu.warpgroup.mma %[[GEN0]], %[[GEN1]], %[[WG_ACC]] {transposeB} : <tensor = memref<128x64xf16, #gpu.address_space<workgroup>>>, <tensor = memref<64x128xf16, #gpu.address_space<workgroup>>>, <fragmented = vector<128x128xf32>> -> <fragmented = vector<128x128xf32>>
+# DUMPIR:       nvgpu.warpgroup.mma.store %[[MMA]], %{{.*}} : <fragmented = vector<128x128xf32>> to memref<128x128xf32>
+# DUMPIR:       gpu.terminator
+# DUMPIR:     }
+# DUMPIR:     %[[CPY3:.*]] = gpu.memcpy async [%{{.*}}] %[[ARG2]], %{{.*}} : memref<128x128xf32>, memref<128x128xf32>
+# DUMPIR:     gpu.wait async [%[[CPY3]]]
+# DUMPIR:     return
+# DUMPIR:   }
diff --git a/mlir/test/Examples/NVGPU/Ch4.py b/mlir/test/Examples/NVGPU/Ch4.py
index 0e3460ff8d63b..dffafda7f21c9 100644
--- a/mlir/test/Examples/NVGPU/Ch4.py
+++ b/mlir/test/Examples/NVGPU/Ch4.py
@@ -1,5 +1,9 @@
 # RUN: env SUPPORT_LIB=%mlir_cuda_runtime \
-# RUN:   %PYTHON %s | FileCheck %s
+# RUN: sh -c 'if [[ "%mlir_run_cuda_sm90_tests" == "1" ]]; \
+# RUN: then %PYTHON %s | FileCheck %s; \
+# RUN: else export MLIR_NVDSL_PRINT_IR=1; \
+# RUN: %PYTHON %s | FileCheck %s --check-prefix=DUMPIR; fi'
+
 
 # ===----------------------------------------------------------------------===//
 #  Chapter 4 : Multistage GEMM with Tensor Core
@@ -259,13 +263,13 @@ def epilogue(D: WGMMAMatrix, d_dev):
 @N...
[truncated]
