[NVGPU] Fix nvdsl examples - take 2 #167321
base: main
Conversation
@llvm/pr-subscribers-mlir-gpu @llvm/pr-subscribers-mlir-nvgpu
Author: Giacomo Castiglioni (castigli)
Changes: This PR re-lands #156830. It aims to fix the nvdsl examples, which drifted out of sync because they are not tested in the CI. The fixed bugs were related to the following PRs:
Patch is 56.82 KiB, truncated to 20.00 KiB below. Full version: https://github.com/llvm/llvm-project/pull/167321.diff
9 Files Affected:
diff --git a/mlir/test/Examples/NVGPU/Ch0.py b/mlir/test/Examples/NVGPU/Ch0.py
index 8f60088178d11..e09720a0f3b75 100644
--- a/mlir/test/Examples/NVGPU/Ch0.py
+++ b/mlir/test/Examples/NVGPU/Ch0.py
@@ -1,5 +1,9 @@
# RUN: env SUPPORT_LIB=%mlir_cuda_runtime \
-# RUN: %PYTHON %s | FileCheck %s
+# RUN: sh -c 'if [[ "%mlir_run_cuda_sm90_tests" == "1" ]]; \
+# RUN: then %PYTHON %s | FileCheck %s; \
+# RUN: else export MLIR_NVDSL_PRINT_IR=1; \
+# RUN: %PYTHON %s | FileCheck %s --check-prefix=DUMPIR; fi'
+
# ===----------------------------------------------------------------------===//
# Chapter 0 : Hello World
@@ -33,7 +37,7 @@ def kernel():
# + operator generates arith.addi
myValue = alpha + tidx
# Print from a GPU thread
- gpu.printf("GPU thread %llu has %llu\n", [tidx, myValue])
+ gpu.printf("GPU thread %llu has %llu\n", tidx, myValue)
# 3. Call the GPU kernel
kernel()
@@ -43,8 +47,24 @@ def kernel():
# 4. The `mlir_func` decorator JIT compiles the IR and executes the MLIR function.
main(alpha)
-
# CHECK: GPU thread 0 has 100
# CHECK: GPU thread 1 has 101
# CHECK: GPU thread 2 has 102
# CHECK: GPU thread 3 has 103
+
+# DUMPIR: func.func @main(%arg0: index) attributes {llvm.emit_c_interface} {
+# DUMPIR: %[[C0_I32:.*]] = arith.constant 0 : i32
+# DUMPIR: %[[C1:.*]] = arith.constant 1 : index
+# DUMPIR: %[[C1_0:.*]] = arith.constant 1 : index
+# DUMPIR: %[[C1_1:.*]] = arith.constant 1 : index
+# DUMPIR: %[[C4:.*]] = arith.constant 4 : index
+# DUMPIR: %[[C1_2:.*]] = arith.constant 1 : index
+# DUMPIR: %[[C1_3:.*]] = arith.constant 1 : index
+# DUMPIR: gpu.launch blocks(%arg1, %arg2, %arg3) in (%arg7 = %[[C1]], %arg8 = %[[C1_0]], %arg9 = %[[C1_1]]) threads(%arg4, %arg5, %arg6) in (%arg10 = %[[C4]], %arg11 = %[[C1_2]], %arg12 = %[[C1_3]]) dynamic_shared_memory_size %[[C0_I32]] {
+# DUMPIR: %[[TIDX:.*]] = gpu.thread_id x
+# DUMPIR: %[[MYVAL:.*]] = arith.addi %arg0, %[[TIDX]] : index
+# DUMPIR: gpu.printf "GPU thread %llu has %llu\0A", %[[TIDX]], %[[MYVAL]] : index, index
+# DUMPIR: gpu.terminator
+# DUMPIR: }
+# DUMPIR: return
+# DUMPIR: }
diff --git a/mlir/test/Examples/NVGPU/Ch1.py b/mlir/test/Examples/NVGPU/Ch1.py
index cfb48d56f8d49..6e44e4d04fa06 100644
--- a/mlir/test/Examples/NVGPU/Ch1.py
+++ b/mlir/test/Examples/NVGPU/Ch1.py
@@ -1,5 +1,9 @@
# RUN: env SUPPORT_LIB=%mlir_cuda_runtime \
-# RUN: %PYTHON %s | FileCheck %s
+# RUN: sh -c 'if [[ "%mlir_run_cuda_sm90_tests" == "1" ]]; \
+# RUN: then %PYTHON %s | FileCheck %s; \
+# RUN: else export MLIR_NVDSL_PRINT_IR=1; \
+# RUN: %PYTHON %s | FileCheck %s --check-prefix=DUMPIR; fi'
+
# ===----------------------------------------------------------------------===//
# Chapter 1 : 2D Saxpy
@@ -24,12 +28,12 @@
def saxpy(x, y, alpha):
# 1. Use MLIR GPU dialect to allocate and copy memory
token_ty = gpu.AsyncTokenType.get()
- t1 = gpu.wait(token_ty, [])
+ t1 = gpu.wait([])
x_dev, t2 = gpu.alloc(x.type, token_ty, [t1], [], [])
y_dev, t3 = gpu.alloc(y.type, token_ty, [t2], [], [])
t4 = gpu.memcpy(token_ty, [t3], x_dev, x)
t5 = gpu.memcpy(token_ty, [t4], y_dev, y)
- t6 = gpu.wait(token_ty, [t5])
+ t6 = gpu.wait([t5])
# 2. Compute 2D SAXPY kernel
@NVDSL.mlir_gpu_launch(grid=(M, 1, 1), block=(N, 1, 1))
@@ -47,7 +51,7 @@ def saxpy_kernel():
saxpy_kernel()
t7 = gpu.memcpy(token_ty, [t6], y, y_dev)
- gpu.wait(token_ty, [t7])
+ gpu.wait([t7])
# 3. Pass numpy arrays to MLIR
@@ -56,11 +60,32 @@ def saxpy_kernel():
alpha = 2.0
x = np.random.randn(M, N).astype(np.float32)
y = np.ones((M, N), np.float32)
+
saxpy(x, y, alpha)
-# 4. Verify MLIR with reference computation
-ref = np.ones((M, N), np.float32)
-ref += x * alpha
-np.testing.assert_allclose(y, ref, rtol=5e-03, atol=1e-01)
-print("PASS")
+if os.getenv("MLIR_NVDSL_PRINT_IR") != "1":
+ # 4. Verify MLIR with reference computation
+ ref = np.ones((M, N), np.float32)
+ ref += x * alpha
+ np.testing.assert_allclose(y, ref, rtol=5e-03, atol=1e-01)
+ print("PASS")
# CHECK-NOT: Mismatched elements
+# CHECK: PASS
+
+# DUMPIR: func.func @saxpy(%[[ARG0:.*]]: memref<256x32xf32>, %[[ARG1:.*]]: memref<256x32xf32>, %[[ARG2:.*]]: f32) attributes {llvm.emit_c_interface} {
+# DUMPIR: %[[WAIT0:.*]] = gpu.wait async
+# DUMPIR: %[[MEMREF:.*]], %[[ASYNC0:.*]] = gpu.alloc async [%[[WAIT0]]] () : memref<256x32xf32>
+# DUMPIR: %[[MEMREF0:.*]], %[[ASYNC1:.*]] = gpu.alloc async [%[[ASYNC0]]] () : memref<256x32xf32>
+# DUMPIR: %[[MEMCPY1:.*]] = gpu.memcpy async [%[[ASYNC1]]] %[[MEMREF]], %[[ARG0]] : memref<256x32xf32>, memref<256x32xf32>
+# DUMPIR: %[[MEMCPY2:.*]] = gpu.memcpy async [%[[MEMCPY1]]] %[[MEMREF0]], %[[ARG1]] : memref<256x32xf32>, memref<256x32xf32>
+# DUMPIR: %[[WAIT1:.*]] = gpu.wait async [%[[MEMCPY2]]]
+# DUMPIR: %[[LD0:.*]] = memref.load %[[MEMREF]][%{{.*}}, %{{.*}}] : memref<256x32xf32>
+# DUMPIR: %[[LD1:.*]] = memref.load %[[MEMREF0]][%{{.*}}, %{{.*}}] : memref<256x32xf32>
+# DUMPIR: %[[MUL:.*]] = arith.mulf %[[LD0]], %[[ARG2]] : f32
+# DUMPIR: %[[ADD:.*]] = arith.addf %[[LD1]], %[[MUL]] : f32
+# DUMPIR: memref.store %[[ADD]], %[[MEMREF0]][%{{.*}}, %{{.*}}] : memref<256x32xf32>
+# DUMPIR: gpu.terminator
+# DUMPIR: %[[MEMCPY3:.*]] = gpu.memcpy async [%[[WAIT1]]] %[[ARG1]], %[[MEMREF0]] : memref<256x32xf32>, memref<256x32xf32>
+# DUMPIR: %[[WAIT2:.*]] = gpu.wait async [%[[MEMCPY3]]]
+# DUMPIR: return
+# DUMPIR: }
diff --git a/mlir/test/Examples/NVGPU/Ch2.py b/mlir/test/Examples/NVGPU/Ch2.py
index 729913c6d5c4f..aba610cee0b34 100644
--- a/mlir/test/Examples/NVGPU/Ch2.py
+++ b/mlir/test/Examples/NVGPU/Ch2.py
@@ -1,5 +1,9 @@
# RUN: env SUPPORT_LIB=%mlir_cuda_runtime \
-# RUN: %PYTHON %s | FileCheck %s
+# RUN: sh -c 'if [[ "%mlir_run_cuda_sm90_tests" == "1" ]]; \
+# RUN: then %PYTHON %s | FileCheck %s; \
+# RUN: else export MLIR_NVDSL_PRINT_IR=1; \
+# RUN: %PYTHON %s | FileCheck %s --check-prefix=DUMPIR; fi'
+
# ===----------------------------------------------------------------------===//
# Chapter 2 : 2D Saxpy with TMA
@@ -28,12 +32,12 @@
@NVDSL.mlir_func
def saxpy(x, y, alpha):
token_ty = gpu.AsyncTokenType.get()
- t1 = gpu.wait(token_ty, [])
+ t1 = gpu.wait([])
x_dev, t2 = gpu.alloc(x.type, token_ty, [t1], [], [])
y_dev, t3 = gpu.alloc(y.type, token_ty, [t2], [], [])
t4 = gpu.memcpy(token_ty, [t3], x_dev, x)
t5 = gpu.memcpy(token_ty, [t4], y_dev, y)
- t6 = gpu.wait(token_ty, [t5])
+ t6 = gpu.wait([t5])
x_tma = TMA([1, N], x.type)
y_tma = TMA([1, N], y.type)
@@ -74,7 +78,7 @@ def saxpy_tma_kernel():
saxpy_tma_kernel()
t7 = gpu.memcpy(token_ty, [t6], y, y_dev)
- gpu.wait(token_ty, [t7])
+ gpu.wait([t7])
# 3. Pass numpy arrays to MLIR
@@ -85,9 +89,46 @@ def saxpy_tma_kernel():
y = np.ones((M, N), np.float32)
saxpy(x, y, alpha)
-# 4. Verify MLIR with reference computation
-ref = np.ones((M, N), np.float32)
-ref += x * alpha
-np.testing.assert_allclose(y, ref, rtol=5e-03, atol=1e-01)
-print("PASS")
+if os.getenv("MLIR_NVDSL_PRINT_IR") != "1":
+ # 4. Verify MLIR with reference computation
+ ref = np.ones((M, N), np.float32)
+ ref += x * alpha
+ np.testing.assert_allclose(y, ref, rtol=5e-03, atol=1e-01)
+ print("PASS")
# CHECK-NOT: Mismatched elements
+# CHECK: PASS
+
+# DUMPIR: func.func @saxpy(%{{.*}}: memref<256x32xf32>, %[[ARG1:.*]]: memref<256x32xf32>, %[[ARG2:.*]]: f32) attributes {llvm.emit_c_interface} {
+# DUMPIR: %[[WAIT0:.*]] = gpu.wait async
+# DUMPIR: %[[MEMREF:.*]], %[[ASYNC0:.*]] = gpu.alloc async [%[[WAIT0]]] () : memref<256x32xf32>
+# DUMPIR: %[[CAST:.*]] = memref.cast %[[MEMREF]] : memref<256x32xf32> to memref<*xf32>
+# DUMPIR: %[[C1:.*]] = arith.constant 1 : index
+# DUMPIR: %[[C32:.*]] = arith.constant 32 : index
+# DUMPIR: %[[TMA0:.*]] = nvgpu.tma.create.descriptor %[[CAST]] box[%[[C1]], %[[C32]]] : memref<*xf32> -> <tensor = memref<1x32xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>
+# DUMPIR: %[[C0:.*]] = arith.constant 0 : index
+# DUMPIR: %[[EQ:.*]] = arith.cmpi eq, %{{.*}}, %[[C0]] : index
+# DUMPIR: %[[MB:.*]] = nvgpu.mbarrier.create -> <memorySpace = #gpu.address_space<workgroup>>
+# DUMPIR: %[[C0_10:.*]] = arith.constant 0 : index
+# DUMPIR: %[[C1_11:.*]] = arith.constant 1 : index
+# DUMPIR: nvgpu.mbarrier.init %[[MB]][%[[C0_10]]], %[[C1_11]], predicate = %[[EQ]] : <memorySpace = #gpu.address_space<workgroup>>
+# DUMPIR: %[[DSM0:.*]] = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
+# DUMPIR: %[[C0_12:.*]] = arith.constant 0 : index
+# DUMPIR: %[[VIEW:.*]] = memref.view %[[DSM0]][%[[C0_12]]][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<1x32xf32, #gpu.address_space<workgroup>>
+# DUMPIR: %[[DSM1:.*]] = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
+# DUMPIR: %[[C128:.*]] = arith.constant 128 : index
+# DUMPIR: %[[VIEW_13:.*]] = memref.view %[[DSM1]][%[[C128]]][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<1x32xf32, #gpu.address_space<workgroup>>
+# DUMPIR: nvgpu.tma.async.load %[[TMA0]][%{{.*}}, %{{.*}}], %[[MB]][%{{.*}}] to %[[VIEW]], predicate = %[[EQ]] : <tensor = memref<1x32xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<1x32xf32, #gpu.address_space<workgroup>>
+# DUMPIR: nvgpu.mbarrier.arrive.expect_tx %[[MB]][%{{.*}}], %{{.*}}, predicate = %[[EQ]] : <memorySpace = #gpu.address_space<workgroup>>
+# DUMPIR: %[[C0_20:.*]] = arith.constant 0 : index
+# DUMPIR: %[[C10000000:.*]] = arith.constant 10000000 : index
+# DUMPIR: %[[FALSE:.*]] = arith.constant false
+# DUMPIR: nvgpu.mbarrier.try_wait.parity %[[MB]][%[[C0_20]]], %[[FALSE]], %[[C10000000]] : <memorySpace = #gpu.address_space<workgroup>>
+# DUMPIR: %[[C0_21:.*]] = arith.constant 0 : index
+# DUMPIR: %[[LD0:.*]] = memref.load %[[VIEW]][%[[C0_21]], %{{.*}}] : memref<1x32xf32, #gpu.address_space<workgroup>>
+# DUMPIR: %[[C0_22:.*]] = arith.constant 0 : index
+# DUMPIR: %[[LD1:.*]] = memref.load %[[VIEW_13]][%[[C0_22]], %{{.*}}] : memref<1x32xf32, #gpu.address_space<workgroup>>
+# DUMPIR: memref.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<256x32xf32>
+# DUMPIR: %[[MEMCPY3:.*]] = gpu.memcpy async [%{{.*}}] %[[ARG1]], %{{.*}} : memref<256x32xf32>, memref<256x32xf32>
+# DUMPIR: %{{.*}} = gpu.wait async [%[[MEMCPY3]]]
+# DUMPIR: return
+# DUMPIR: }
diff --git a/mlir/test/Examples/NVGPU/Ch3.py b/mlir/test/Examples/NVGPU/Ch3.py
index eb96b11c63416..fe11575416866 100644
--- a/mlir/test/Examples/NVGPU/Ch3.py
+++ b/mlir/test/Examples/NVGPU/Ch3.py
@@ -1,5 +1,9 @@
# RUN: env SUPPORT_LIB=%mlir_cuda_runtime \
-# RUN: %PYTHON %s | FileCheck %s
+# RUN: sh -c 'if [[ "%mlir_run_cuda_sm90_tests" == "1" ]]; \
+# RUN: then %PYTHON %s | FileCheck %s; \
+# RUN: else export MLIR_NVDSL_PRINT_IR=1; \
+# RUN: %PYTHON %s | FileCheck %s --check-prefix=DUMPIR; fi'
+
# ===----------------------------------------------------------------------===//
# Chapter 3 : GEMM 128x128x64 with Tensor Core
@@ -60,13 +64,13 @@ def tma_load(
@NVDSL.mlir_func
def gemm_128_128_64(a, b, d):
token_ty = gpu.AsyncTokenType.get()
- t1 = gpu.wait(token_ty, [])
+ t1 = gpu.wait([])
a_dev, t2 = gpu.alloc(a.type, token_ty, [t1], [], [])
b_dev, t3 = gpu.alloc(b.type, token_ty, [t2], [], [])
d_dev, t4 = gpu.alloc(d.type, token_ty, [t3], [], [])
t5 = gpu.memcpy(token_ty, [t4], a_dev, a)
t6 = gpu.memcpy(token_ty, [t5], b_dev, b)
- t7 = gpu.wait(token_ty, [t6])
+ t7 = gpu.wait([t6])
sw = nvgpu.TensorMapSwizzleKind.SWIZZLE_128B
a_tma = TMA([128, 64], a.type, swizzle=sw)
@@ -111,7 +115,7 @@ def gemm_tma_kernel():
gemm_tma_kernel()
t8 = gpu.memcpy(token_ty, [t7], d, d_dev)
- gpu.wait(None, [t8])
+ gpu.wait([t8])
# Python pass arguments to MLIR
@@ -123,7 +127,73 @@ def gemm_tma_kernel():
d = np.zeros((M, N), np.float32)
gemm_128_128_64(a, b, d)
-ref_d = a.astype(np.float16) @ b.astype(np.float16)
-np.testing.assert_allclose(d, ref_d, rtol=5e-03, atol=1e-01)
-print("PASS")
+if os.getenv("MLIR_NVDSL_PRINT_IR") != "1":
+ # Verify MLIR program with reference computation in python
+ ref_d = a.astype(np.float16) @ b.astype(np.float16)
+ np.testing.assert_allclose(d, ref_d, rtol=5e-03, atol=1e-01)
+ print("PASS")
# CHECK-NOT: Mismatched elements
+# CHECK: PASS
+
+# DUMPIR: func.func @gemm_128_128_64(%{{.*}}: memref<128x64xf16>, %{{.*}}: memref<64x128xf16>, %[[ARG2:.*]]: memref<128x128xf32>) attributes {llvm.emit_c_interface} {
+# DUMPIR: %[[C128:.*]] = arith.constant 128 : index
+# DUMPIR: %[[C64:.*]] = arith.constant 64 : index
+# DUMPIR: %[[TMA0:.*]] = nvgpu.tma.create.descriptor %{{.*}} box[%[[C128]], %[[C64]]] : memref<*xf16> -> <tensor = memref<128x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none>
+# DUMPIR: %[[CAST1:.*]] = memref.cast %{{.*}} : memref<64x128xf16> to memref<*xf16>
+# DUMPIR: %[[C64_5:.*]] = arith.constant 64 : index
+# DUMPIR: %[[C64_6:.*]] = arith.constant 64 : index
+# DUMPIR: %[[TMA1:.*]] = nvgpu.tma.create.descriptor %[[CAST1]] box[%[[C64_5]], %[[C64_6]]] : memref<*xf16> -> <tensor = memref<64x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none>
+# DUMPIR: %[[THREADID:.*]] = gpu.thread_id x
+# DUMPIR: %[[MB:.*]] = nvgpu.mbarrier.create -> <memorySpace = #gpu.address_space<workgroup>>
+# DUMPIR: %[[C0:.*]] = arith.constant 0 : index
+# DUMPIR: %[[EQ:.*]] = arith.cmpi eq, %[[THREADID]], %[[C0]] : index
+# DUMPIR: %[[C0_12:.*]] = arith.constant 0 : index
+# DUMPIR: %[[C1_13:.*]] = arith.constant 1 : index
+# DUMPIR: nvgpu.mbarrier.init %[[MB]][%[[C0_12]]], %[[C1_13]], predicate = %[[EQ]] : <memorySpace = #gpu.address_space<workgroup>>
+# DUMPIR: nvgpu.tma.prefetch.descriptor %[[TMA0]], predicate = %[[EQ]] : <tensor = memref<128x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none>
+# DUMPIR: nvgpu.tma.prefetch.descriptor %[[TMA1]], predicate = %[[EQ]] : <tensor = memref<64x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none>
+# DUMPIR: %[[DSM0:.*]] = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
+# DUMPIR: %[[C0_14:.*]] = arith.constant 0 : index
+# DUMPIR: %[[VIEW:.*]] = memref.view %[[DSM0]][%[[C0_14]]][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<128x64xf16, #gpu.address_space<workgroup>>
+# DUMPIR: %[[DSM1:.*]] = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
+# DUMPIR: %[[C16384:.*]] = arith.constant 16384 : index
+# DUMPIR: %[[VIEW_15:.*]] = memref.view %[[DSM1]][%[[C16384]]][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<64x128xf16, #gpu.address_space<workgroup>>
+# DUMPIR: %[[DSM2:.*]] = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
+# DUMPIR: %[[C0_16:.*]] = arith.constant 0 : index
+# DUMPIR: %[[VIEW_17:.*]] = memref.view %[[DSM2]][%[[C0_16]]][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<128x64xf16, #gpu.address_space<workgroup>>
+# DUMPIR: %[[DSM3:.*]] = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
+# DUMPIR: %[[C16384_18:.*]] = arith.constant 16384 : index
+# DUMPIR: %[[VIEW_19:.*]] = memref.view %[[DSM3]][%[[C16384_18]]][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<64x64xf16, #gpu.address_space<workgroup>>
+# DUMPIR: %[[DSM4:.*]] = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
+# DUMPIR: %[[C24576:.*]] = arith.constant 24576 : index
+# DUMPIR: %[[VIEW_20:.*]] = memref.view %[[DSM4]][%[[C24576]]][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<64x64xf16, #gpu.address_space<workgroup>>
+# DUMPIR: %[[C0_21:.*]] = arith.constant 0 : index
+# DUMPIR: %[[C32768:.*]] = arith.constant 32768 : index
+# DUMPIR: nvgpu.mbarrier.arrive.expect_tx %[[MB]][%[[C0_21]]], %[[C32768]], predicate = %[[EQ]] : <memorySpace = #gpu.address_space<workgroup>>
+# DUMPIR: %[[C0_22:.*]] = arith.constant 0 : index
+# DUMPIR: %[[C0_23:.*]] = arith.constant 0 : index
+# DUMPIR: %[[C0_24:.*]] = arith.constant 0 : index
+# DUMPIR: nvgpu.tma.async.load %[[TMA0]][%[[C0_23]], %[[C0_24]]], %[[MB]][%[[C0_22]]] to %[[VIEW_17]], predicate = %[[EQ]] : <tensor = memref<128x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<128x64xf16, #gpu.address_space<workgroup>>
+# DUMPIR: %[[C0_25:.*]] = arith.constant 0 : index
+# DUMPIR: %[[C0_26:.*]] = arith.constant 0 : index
+# DUMPIR: %[[C0_27:.*]] = arith.constant 0 : index
+# DUMPIR: nvgpu.tma.async.load %[[TMA1]][%[[C0_26]], %[[C0_27]]], %[[MB]][%[[C0_25]]] to %[[VIEW_19]], predicate = %[[EQ]] : <tensor = memref<64x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<64x64xf16, #gpu.address_space<workgroup>>
+# DUMPIR: %[[C0_28:.*]] = arith.constant 0 : index
+# DUMPIR: %[[C64_29:.*]] = arith.constant 64 : index
+# DUMPIR: %[[C0_30:.*]] = arith.constant 0 : index
+# DUMPIR: nvgpu.tma.async.load %[[TMA1]][%[[C64_29]], %[[C0_30]]], %[[MB]][%[[C0_28]]] to %[[VIEW_20]], predicate = %[[EQ]] : <tensor = memref<64x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<64x64xf16, #gpu.address_space<workgroup>>
+# DUMPIR: %[[C0_31:.*]] = arith.constant 0 : index
+# DUMPIR: %[[C10000000:.*]] = arith.constant 10000000 : index
+# DUMPIR: %[[FALSE:.*]] = arith.constant false
+# DUMPIR: nvgpu.mbarrier.try_wait.parity %[[MB]][%[[C0_31]]], %[[FALSE]], %[[C10000000]] : <memorySpace = #gpu.address_space<workgroup>>
+# DUMPIR: %[[WG_ACC:.*]] = nvgpu.warpgroup.mma.init.accumulator -> <fragmented = vector<128x128xf32>>
+# DUMPIR: %[[GEN0:.*]] = nvgpu.warpgroup.generate.descriptor %[[VIEW]], %[[TMA0]] : memref<128x64xf16, #gpu.address_space<workgroup>>, <tensor = memref<128x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none> -> <tensor = memref<128x64xf16, #gpu.address_space<workgroup>>>
+# DUMPIR: %[[GEN1:.*]] = nvgpu.warpgroup.generate.descriptor %[[VIEW_15]], %[[TMA1]] : memref<64x128xf16, #gpu.address_space<workgroup>>, <tensor = memref<64x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none> -> <tensor = memref<64x128xf16, #gpu.address_space<workgroup>>>
+# DUMPIR: %[[MMA:.*]] = nvgpu.warpgroup.mma %[[GEN0]], %[[GEN1]], %[[WG_ACC]] {transposeB} : <tensor = memref<128x64xf16, #gpu.address_space<workgroup>>>, <tensor = memref<64x128xf16, #gpu.address_space<workgroup>>>, <fragmented = vector<128x128xf32>> -> <fragmented = vector<128x128xf32>>
+# DUMPIR: nvgpu.warpgroup.mma.store %[[MMA]], %{{.*}} : <fragmented = vector<128x128xf32>> to memref<128x128xf32>
+# DUMPIR: gpu.terminator
+# DUMPIR: }
+# DUMPIR: %[[CPY3:.*]] = gpu.memcpy async [%{{.*}}] %[[ARG2]], %{{.*}} : memref<128x128xf32>, memref<128x128xf32>
+# DUMPIR: gpu.wait async [%[[CPY3]]]
+# DUMPIR: return
+# DUMPIR: }
diff --git a/mlir/test/Examples/NVGPU/Ch4.py b/mlir/test/Examples/NVGPU/Ch4.py
index 0e3460ff8d63b..dffafda7f21c9 100644
--- a/mlir/test/Examples/NVGPU/Ch4.py
+++ b/mlir/test/Examples/NVGPU/Ch4.py
@@ -1,5 +1,9 @@
# RUN: env SUPPORT_LIB=%mlir_cuda_runtime \
-# RUN: %PYTHON %s | FileCheck %s
+# RUN: sh -c 'if [[ "%mlir_run_cuda_sm90_tests" == "1" ]]; \
+# RUN: then %PYTHON %s | FileCheck %s; \
+# RUN: else export MLIR_NVDSL_PRINT_IR=1; \
+# RUN: %PYTHON %s | FileCheck %s --check-prefix=DUMPIR; fi'
+
# ===----------------------------------------------------------------------===//
# Chapter 4 : Multistage GEMM with Tensor Core
@@ -259,13 +263,13 @@ def epilogue(D: WGMMAMatrix, d_dev):
@N...
[truncated]
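The RUN-line change above switches each example between two modes: when the lit substitution %mlir_run_cuda_sm90_tests expands to "1", the script executes on the GPU and FileCheck uses the default CHECK prefix; otherwise MLIR_NVDSL_PRINT_IR=1 is exported and the DUMPIR prefix matches the printed IR instead. Below is a minimal, self-contained sketch of the same gating pattern the patch adds to every example. The saxpy function here is only a stand-in for the real NVDSL example (the actual kernel is built with NVDSL.mlir_func and the GPU dialect); M, N, and alpha mirror the values used in Ch1.py.

import os

import numpy as np

M, N = 256, 32
alpha = 2.0
x = np.random.randn(M, N).astype(np.float32)
y = np.ones((M, N), np.float32)


def saxpy(x, y, alpha):
    # Stand-in for the NVDSL example: with MLIR_NVDSL_PRINT_IR=1 the real
    # script only prints the generated MLIR and never runs the kernel, so y
    # is left untouched.
    if os.getenv("MLIR_NVDSL_PRINT_IR") == "1":
        print("func.func @saxpy(...)")  # placeholder for the IR dump
        return
    y += x * alpha  # what the GPU kernel computes


saxpy(x, y, alpha)

# Same gating the patch adds after each example: only compare against the
# NumPy reference when the kernel actually executed.
if os.getenv("MLIR_NVDSL_PRINT_IR") != "1":
    ref = np.ones((M, N), np.float32)
    ref += x * alpha
    np.testing.assert_allclose(y, ref, rtol=5e-03, atol=1e-01)
    print("PASS")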
This PR re-lands #156830.
This PR aims to fix the nvdsl examples, which drifted out of sync because they are not tested in the CI.
The fixed bugs were related to the following PRs: