26 changes: 23 additions & 3 deletions mlir/test/Examples/NVGPU/Ch0.py
@@ -1,5 +1,9 @@
# RUN: env SUPPORT_LIB=%mlir_cuda_runtime \
# RUN: %PYTHON %s | FileCheck %s
# RUN: sh -c 'if [[ "%mlir_run_cuda_sm90_tests" == "1" ]]; \
# RUN: then %PYTHON %s | FileCheck %s; \
# RUN: else export MLIR_NVDSL_PRINT_IR=1; \
# RUN: %PYTHON %s | FileCheck %s --check-prefix=DUMPIR; fi'
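The new RUN lines give every chapter two modes: when the lit substitution %mlir_run_cuda_sm90_tests is "1", the script executes on an sm90 GPU and FileCheck verifies the program output (CHECK prefix); otherwise MLIR_NVDSL_PRINT_IR=1 makes the script emit the generated MLIR instead, which FileCheck matches under the DUMPIR prefix. The chapters below gate their numpy verification on the same variable; a minimal sketch of the branch as each script observes it:

import os

# Two test modes, as seen from inside each chapter script.
if os.getenv("MLIR_NVDSL_PRINT_IR") != "1":
    pass  # sm90 mode: run on the GPU, print results, match CHECK lines
else:
    pass  # IR-dump mode: NVDSL prints the generated MLIR, match DUMPIR lines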


# ===----------------------------------------------------------------------===//
# Chapter 0 : Hello World
@@ -33,7 +37,7 @@ def kernel():
# + operator generates arith.addi
myValue = alpha + tidx
# Print from a GPU thread
gpu.printf("GPU thread %llu has %llu\n", [tidx, myValue])
gpu.printf("GPU thread %llu has %llu\n", tidx, myValue)

# 3. Call the GPU kernel
kernel()
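The only change inside the kernel body is the gpu.printf calling convention: the Python bindings now take the format operands as positional varargs instead of a single list. Side by side:

# Before: operands wrapped in a list
gpu.printf("GPU thread %llu has %llu\n", [tidx, myValue])
# After: operands passed directly as varargs
gpu.printf("GPU thread %llu has %llu\n", tidx, myValue)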
@@ -43,8 +47,24 @@ def kernel():
# 4. The `mlir_func` decorator JIT compiles the IR and executes the MLIR function.
main(alpha)


# CHECK: GPU thread 0 has 100
# CHECK: GPU thread 1 has 101
# CHECK: GPU thread 2 has 102
# CHECK: GPU thread 3 has 103

# DUMPIR: func.func @main(%arg0: index) attributes {llvm.emit_c_interface} {
# DUMPIR: %[[C0_I32:.*]] = arith.constant 0 : i32
# DUMPIR: %[[C1:.*]] = arith.constant 1 : index
# DUMPIR: %[[C1_0:.*]] = arith.constant 1 : index
# DUMPIR: %[[C1_1:.*]] = arith.constant 1 : index
# DUMPIR: %[[C4:.*]] = arith.constant 4 : index
# DUMPIR: %[[C1_2:.*]] = arith.constant 1 : index
# DUMPIR: %[[C1_3:.*]] = arith.constant 1 : index
# DUMPIR: gpu.launch blocks(%arg1, %arg2, %arg3) in (%arg7 = %[[C1]], %arg8 = %[[C1_0]], %arg9 = %[[C1_1]]) threads(%arg4, %arg5, %arg6) in (%arg10 = %[[C4]], %arg11 = %[[C1_2]], %arg12 = %[[C1_3]]) dynamic_shared_memory_size %[[C0_I32]] {
# DUMPIR: %[[TIDX:.*]] = gpu.thread_id x
# DUMPIR: %[[MYVAL:.*]] = arith.addi %arg0, %[[TIDX]] : index
# DUMPIR: gpu.printf "GPU thread %llu has %llu\0A", %[[TIDX]], %[[MYVAL]] : index, index
# DUMPIR: gpu.terminator
# DUMPIR: }
# DUMPIR: return
# DUMPIR: }
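In the DUMPIR block above, %[[C4:.*]] is a FileCheck capture: it binds the SSA name the compiler actually emitted, and a later %[[C4]] must match that exact text, while %{{.*}} matches anything without binding. The thread-id pair from this chapter illustrates both halves:

# The first line captures whatever SSA name gpu.thread_id produced;
# the second line requires that same name to reappear in the addi.
# DUMPIR: %[[TIDX:.*]] = gpu.thread_id x
# DUMPIR: %[[MYVAL:.*]] = arith.addi %arg0, %[[TIDX]] : index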
43 changes: 34 additions & 9 deletions mlir/test/Examples/NVGPU/Ch1.py
@@ -1,5 +1,9 @@
# RUN: env SUPPORT_LIB=%mlir_cuda_runtime \
# RUN: %PYTHON %s | FileCheck %s
# RUN: sh -c 'if [[ "%mlir_run_cuda_sm90_tests" == "1" ]]; \
# RUN: then %PYTHON %s | FileCheck %s; \
# RUN: else export MLIR_NVDSL_PRINT_IR=1; \
# RUN: %PYTHON %s | FileCheck %s --check-prefix=DUMPIR; fi'


# ===----------------------------------------------------------------------===//
# Chapter 1 : 2D Saxpy
@@ -24,12 +28,12 @@
def saxpy(x, y, alpha):
# 1. Use MLIR GPU dialect to allocate and copy memory
token_ty = gpu.AsyncTokenType.get()
t1 = gpu.wait(token_ty, [])
t1 = gpu.wait([])
x_dev, t2 = gpu.alloc(x.type, token_ty, [t1], [], [])
y_dev, t3 = gpu.alloc(y.type, token_ty, [t2], [], [])
t4 = gpu.memcpy(token_ty, [t3], x_dev, x)
t5 = gpu.memcpy(token_ty, [t4], y_dev, y)
t6 = gpu.wait(token_ty, [t5])
t6 = gpu.wait([t5])

# 2. Compute 2D SAXPY kernel
@NVDSL.mlir_gpu_launch(grid=(M, 1, 1), block=(N, 1, 1))
@@ -47,7 +51,7 @@ def saxpy_kernel():
saxpy_kernel()

t7 = gpu.memcpy(token_ty, [t6], y, y_dev)
gpu.wait(token_ty, [t7])
gpu.wait([t7])
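The gpu.wait calls drop their explicit token-type result argument here, presumably because the updated bindings supply it by default; the allocation and copy calls are unchanged and still thread an explicit async-token chain. The same calls from the hunk above, annotated:

t1 = gpu.wait([])                          # open an async chain with no dependencies
x_dev, t2 = gpu.alloc(x.type, token_ty, [t1], [], [])  # device alloc, ordered after t1
y_dev, t3 = gpu.alloc(y.type, token_ty, [t2], [], [])  # device alloc, ordered after t2
t4 = gpu.memcpy(token_ty, [t3], x_dev, x)  # host-to-device copy of x
t5 = gpu.memcpy(token_ty, [t4], y_dev, y)  # host-to-device copy of y
t6 = gpu.wait([t5])                        # synchronize before the kernel launch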


# 3. Pass numpy arrays to MLIR
@@ -56,11 +60,32 @@ def saxpy_kernel():
alpha = 2.0
x = np.random.randn(M, N).astype(np.float32)
y = np.ones((M, N), np.float32)

saxpy(x, y, alpha)

# 4. Verify MLIR with reference computation
ref = np.ones((M, N), np.float32)
ref += x * alpha
np.testing.assert_allclose(y, ref, rtol=5e-03, atol=1e-01)
print("PASS")
if os.getenv("MLIR_NVDSL_PRINT_IR") != "1":
# 4. Verify MLIR with reference computation
ref = np.ones((M, N), np.float32)
ref += x * alpha
np.testing.assert_allclose(y, ref, rtol=5e-03, atol=1e-01)
print("PASS")
# CHECK-NOT: Mismatched elements
# CHECK: PASS
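The guarded block computes the usual SAXPY reference, y <- alpha * x + y. A self-contained numpy restatement with this chapter's shapes (memref<256x32xf32>, alpha = 2.0):

import numpy as np

M, N = 256, 32
alpha = 2.0
x = np.random.randn(M, N).astype(np.float32)
y = np.ones((M, N), np.float32)
ref = np.ones((M, N), np.float32) + x * alpha  # reference the test checks y against
# np.testing.assert_allclose(y, ref, rtol=5e-03, atol=1e-01)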

# DUMPIR: func.func @saxpy(%[[ARG0:.*]]: memref<256x32xf32>, %[[ARG1:.*]]: memref<256x32xf32>, %[[ARG2:.*]]: f32) attributes {llvm.emit_c_interface} {
# DUMPIR: %[[WAIT0:.*]] = gpu.wait async
# DUMPIR: %[[MEMREF:.*]], %[[ASYNC0:.*]] = gpu.alloc async [%[[WAIT0]]] () : memref<256x32xf32>
# DUMPIR: %[[MEMREF0:.*]], %[[ASYNC1:.*]] = gpu.alloc async [%[[ASYNC0]]] () : memref<256x32xf32>
# DUMPIR: %[[MEMCPY1:.*]] = gpu.memcpy async [%[[ASYNC1]]] %[[MEMREF]], %[[ARG0]] : memref<256x32xf32>, memref<256x32xf32>
# DUMPIR: %[[MEMCPY2:.*]] = gpu.memcpy async [%[[MEMCPY1]]] %[[MEMREF0]], %[[ARG1]] : memref<256x32xf32>, memref<256x32xf32>
# DUMPIR: %[[WAIT1:.*]] = gpu.wait async [%[[MEMCPY2]]]
# DUMPIR: %[[LD0:.*]] = memref.load %[[MEMREF]][%{{.*}}, %{{.*}}] : memref<256x32xf32>
# DUMPIR: %[[LD1:.*]] = memref.load %[[MEMREF0]][%{{.*}}, %{{.*}}] : memref<256x32xf32>
# DUMPIR: %[[MUL:.*]] = arith.mulf %[[LD0]], %[[ARG2]] : f32
# DUMPIR: %[[ADD:.*]] = arith.addf %[[LD1]], %[[MUL]] : f32
# DUMPIR: memref.store %[[ADD]], %[[MEMREF0]][%{{.*}}, %{{.*}}] : memref<256x32xf32>
# DUMPIR: gpu.terminator
# DUMPIR: %[[MEMCPY3:.*]] = gpu.memcpy async [%[[WAIT1]]] %[[ARG1]], %[[MEMREF0]] : memref<256x32xf32>, memref<256x32xf32>
# DUMPIR: %[[WAIT2:.*]] = gpu.wait async [%[[MEMCPY3]]]
# DUMPIR: return
# DUMPIR: }
59 changes: 50 additions & 9 deletions mlir/test/Examples/NVGPU/Ch2.py
@@ -1,5 +1,9 @@
# RUN: env SUPPORT_LIB=%mlir_cuda_runtime \
# RUN: %PYTHON %s | FileCheck %s
# RUN: sh -c 'if [[ "%mlir_run_cuda_sm90_tests" == "1" ]]; \
# RUN: then %PYTHON %s | FileCheck %s; \
# RUN: else export MLIR_NVDSL_PRINT_IR=1; \
# RUN: %PYTHON %s | FileCheck %s --check-prefix=DUMPIR; fi'


# ===----------------------------------------------------------------------===//
# Chapter 2 : 2D Saxpy with TMA
@@ -28,12 +32,12 @@
@NVDSL.mlir_func
def saxpy(x, y, alpha):
token_ty = gpu.AsyncTokenType.get()
t1 = gpu.wait(token_ty, [])
t1 = gpu.wait([])
x_dev, t2 = gpu.alloc(x.type, token_ty, [t1], [], [])
y_dev, t3 = gpu.alloc(y.type, token_ty, [t2], [], [])
t4 = gpu.memcpy(token_ty, [t3], x_dev, x)
t5 = gpu.memcpy(token_ty, [t4], y_dev, y)
t6 = gpu.wait(token_ty, [t5])
t6 = gpu.wait([t5])

x_tma = TMA([1, N], x.type)
y_tma = TMA([1, N], y.type)
@@ -74,7 +78,7 @@ def saxpy_tma_kernel():
saxpy_tma_kernel()

t7 = gpu.memcpy(token_ty, [t6], y, y_dev)
gpu.wait(token_ty, [t7])
gpu.wait([t7])


# 3. Pass numpy arrays to MLIR
@@ -85,9 +89,46 @@ def saxpy_tma_kernel():
y = np.ones((M, N), np.float32)
saxpy(x, y, alpha)

# 4. Verify MLIR with reference computation
ref = np.ones((M, N), np.float32)
ref += x * alpha
np.testing.assert_allclose(y, ref, rtol=5e-03, atol=1e-01)
print("PASS")
if os.getenv("MLIR_NVDSL_PRINT_IR") != "1":
# 4. Verify MLIR with reference computation
ref = np.ones((M, N), np.float32)
ref += x * alpha
np.testing.assert_allclose(y, ref, rtol=5e-03, atol=1e-01)
print("PASS")
# CHECK-NOT: Mismatched elements
# CHECK: PASS

# DUMPIR: func.func @saxpy(%{{.*}}: memref<256x32xf32>, %[[ARG1:.*]]: memref<256x32xf32>, %[[ARG2:.*]]: f32) attributes {llvm.emit_c_interface} {
# DUMPIR: %[[WAIT0:.*]] = gpu.wait async
# DUMPIR: %[[MEMREF:.*]], %[[ASYNC0:.*]] = gpu.alloc async [%[[WAIT0]]] () : memref<256x32xf32>
# DUMPIR: %[[CAST:.*]] = memref.cast %[[MEMREF]] : memref<256x32xf32> to memref<*xf32>
# DUMPIR: %[[C1:.*]] = arith.constant 1 : index
# DUMPIR: %[[C32:.*]] = arith.constant 32 : index
# DUMPIR: %[[TMA0:.*]] = nvgpu.tma.create.descriptor %[[CAST]] box[%[[C1]], %[[C32]]] : memref<*xf32> -> <tensor = memref<1x32xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>
# DUMPIR: %[[C0:.*]] = arith.constant 0 : index
# DUMPIR: %[[EQ:.*]] = arith.cmpi eq, %{{.*}}, %[[C0]] : index
# DUMPIR: %[[MB:.*]] = nvgpu.mbarrier.create -> <memorySpace = #gpu.address_space<workgroup>>
# DUMPIR: %[[C0_10:.*]] = arith.constant 0 : index
# DUMPIR: %[[C1_11:.*]] = arith.constant 1 : index
# DUMPIR: nvgpu.mbarrier.init %[[MB]][%[[C0_10]]], %[[C1_11]], predicate = %[[EQ]] : <memorySpace = #gpu.address_space<workgroup>>
# DUMPIR: %[[DSM0:.*]] = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
# DUMPIR: %[[C0_12:.*]] = arith.constant 0 : index
# DUMPIR: %[[VIEW:.*]] = memref.view %[[DSM0]][%[[C0_12]]][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<1x32xf32, #gpu.address_space<workgroup>>
# DUMPIR: %[[DSM1:.*]] = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
# DUMPIR: %[[C128:.*]] = arith.constant 128 : index
# DUMPIR: %[[VIEW_13:.*]] = memref.view %[[DSM1]][%[[C128]]][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<1x32xf32, #gpu.address_space<workgroup>>
# DUMPIR: nvgpu.tma.async.load %[[TMA0]][%{{.*}}, %{{.*}}], %[[MB]][%{{.*}}] to %[[VIEW]], predicate = %[[EQ]] : <tensor = memref<1x32xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<1x32xf32, #gpu.address_space<workgroup>>
# DUMPIR: nvgpu.mbarrier.arrive.expect_tx %[[MB]][%{{.*}}], %{{.*}}, predicate = %[[EQ]] : <memorySpace = #gpu.address_space<workgroup>>
# DUMPIR: %[[C0_20:.*]] = arith.constant 0 : index
# DUMPIR: %[[C10000000:.*]] = arith.constant 10000000 : index
# DUMPIR: %[[FALSE:.*]] = arith.constant false
# DUMPIR: nvgpu.mbarrier.try_wait.parity %[[MB]][%[[C0_20]]], %[[FALSE]], %[[C10000000]] : <memorySpace = #gpu.address_space<workgroup>>
# DUMPIR: %[[C0_21:.*]] = arith.constant 0 : index
# DUMPIR: %[[LD0:.*]] = memref.load %[[VIEW]][%[[C0_21]], %{{.*}}] : memref<1x32xf32, #gpu.address_space<workgroup>>
# DUMPIR: %[[C0_22:.*]] = arith.constant 0 : index
# DUMPIR: %[[LD1:.*]] = memref.load %[[VIEW_13]][%[[C0_22]], %{{.*}}] : memref<1x32xf32, #gpu.address_space<workgroup>>
# DUMPIR: memref.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<256x32xf32>
# DUMPIR: %[[MEMCPY3:.*]] = gpu.memcpy async [%{{.*}}] %[[ARG1]], %{{.*}} : memref<256x32xf32>, memref<256x32xf32>
# DUMPIR: %{{.*}} = gpu.wait async [%[[MEMCPY3]]]
# DUMPIR: return
# DUMPIR: }
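Read top to bottom, the DUMPIR block records the TMA handshake this chapter introduces. A comment-level summary, with op names taken from the IR above:

# 1. nvgpu.tma.create.descriptor     : host-side descriptor for a [1, N] box per array
# 2. nvgpu.mbarrier.create/init      : one mbarrier, initialized by thread 0 only (predicate %[[EQ]])
# 3. memref.view                     : carve per-array tiles out of dynamic shared memory
# 4. nvgpu.tma.async.load            : thread 0 starts the bulk copy into the shared view
# 5. nvgpu.mbarrier.arrive.expect_tx : announce the expected transfer size in bytes
# 6. nvgpu.mbarrier.try_wait.parity  : all threads block until the data has landed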
84 changes: 77 additions & 7 deletions mlir/test/Examples/NVGPU/Ch3.py
@@ -1,5 +1,9 @@
# RUN: env SUPPORT_LIB=%mlir_cuda_runtime \
# RUN: %PYTHON %s | FileCheck %s
# RUN: sh -c 'if [[ "%mlir_run_cuda_sm90_tests" == "1" ]]; \
# RUN: then %PYTHON %s | FileCheck %s; \
# RUN: else export MLIR_NVDSL_PRINT_IR=1; \
# RUN: %PYTHON %s | FileCheck %s --check-prefix=DUMPIR; fi'


# ===----------------------------------------------------------------------===//
# Chapter 3 : GEMM 128x128x64 with Tensor Core
@@ -60,13 +64,13 @@ def tma_load(
@NVDSL.mlir_func
def gemm_128_128_64(a, b, d):
token_ty = gpu.AsyncTokenType.get()
t1 = gpu.wait(token_ty, [])
t1 = gpu.wait([])
a_dev, t2 = gpu.alloc(a.type, token_ty, [t1], [], [])
b_dev, t3 = gpu.alloc(b.type, token_ty, [t2], [], [])
d_dev, t4 = gpu.alloc(d.type, token_ty, [t3], [], [])
t5 = gpu.memcpy(token_ty, [t4], a_dev, a)
t6 = gpu.memcpy(token_ty, [t5], b_dev, b)
t7 = gpu.wait(token_ty, [t6])
t7 = gpu.wait([t6])

sw = nvgpu.TensorMapSwizzleKind.SWIZZLE_128B
a_tma = TMA([128, 64], a.type, swizzle=sw)
@@ -111,7 +115,7 @@ def gemm_tma_kernel():
gemm_tma_kernel()

t8 = gpu.memcpy(token_ty, [t7], d, d_dev)
gpu.wait(None, [t8])
gpu.wait([t8])


# Pass arguments from Python to MLIR
@@ -123,7 +127,73 @@ def gemm_tma_kernel():
d = np.zeros((M, N), np.float32)
gemm_128_128_64(a, b, d)

ref_d = a.astype(np.float16) @ b.astype(np.float16)
np.testing.assert_allclose(d, ref_d, rtol=5e-03, atol=1e-01)
print("PASS")
if os.getenv("MLIR_NVDSL_PRINT_IR") != "1":
# Verify MLIR program with reference computation in python
ref_d = a.astype(np.float16) @ b.astype(np.float16)
np.testing.assert_allclose(d, ref_d, rtol=5e-03, atol=1e-01)
print("PASS")
# CHECK-NOT: Mismatched elements
# CHECK: PASS
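The reference here is a straight half-precision matmul compared against the f32 device result with loose tolerances. A self-contained numpy sketch with this chapter's shapes (M = N = 128, K = 64, per the memref types above); how a and b are generated is an assumption carried over from the earlier chapters:

import numpy as np

M, N, K = 128, 128, 64
a = np.random.randn(M, K).astype(np.float16)  # assumed input generation
b = np.random.randn(K, N).astype(np.float16)  # assumed input generation
ref_d = a.astype(np.float16) @ b.astype(np.float16)  # same product the test checks d against
# The GPU kernel fills d (f32); the test then runs:
# np.testing.assert_allclose(d, ref_d, rtol=5e-03, atol=1e-01)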

# DUMPIR: func.func @gemm_128_128_64(%{{.*}}: memref<128x64xf16>, %{{.*}}: memref<64x128xf16>, %[[ARG2:.*]]: memref<128x128xf32>) attributes {llvm.emit_c_interface} {
# DUMPIR: %[[C128:.*]] = arith.constant 128 : index
# DUMPIR: %[[C64:.*]] = arith.constant 64 : index
# DUMPIR: %[[TMA0:.*]] = nvgpu.tma.create.descriptor %{{.*}} box[%[[C128]], %[[C64]]] : memref<*xf16> -> <tensor = memref<128x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none>
# DUMPIR: %[[CAST1:.*]] = memref.cast %{{.*}} : memref<64x128xf16> to memref<*xf16>
# DUMPIR: %[[C64_5:.*]] = arith.constant 64 : index
# DUMPIR: %[[C64_6:.*]] = arith.constant 64 : index
# DUMPIR: %[[TMA1:.*]] = nvgpu.tma.create.descriptor %[[CAST1]] box[%[[C64_5]], %[[C64_6]]] : memref<*xf16> -> <tensor = memref<64x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none>
# DUMPIR: %[[THREADID:.*]] = gpu.thread_id x
# DUMPIR: %[[MB:.*]] = nvgpu.mbarrier.create -> <memorySpace = #gpu.address_space<workgroup>>
# DUMPIR: %[[C0:.*]] = arith.constant 0 : index
# DUMPIR: %[[EQ:.*]] = arith.cmpi eq, %[[THREADID]], %[[C0]] : index
# DUMPIR: %[[C0_12:.*]] = arith.constant 0 : index
# DUMPIR: %[[C1_13:.*]] = arith.constant 1 : index
# DUMPIR: nvgpu.mbarrier.init %[[MB]][%[[C0_12]]], %[[C1_13]], predicate = %[[EQ]] : <memorySpace = #gpu.address_space<workgroup>>
# DUMPIR: nvgpu.tma.prefetch.descriptor %[[TMA0]], predicate = %[[EQ]] : <tensor = memref<128x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none>
# DUMPIR: nvgpu.tma.prefetch.descriptor %[[TMA1]], predicate = %[[EQ]] : <tensor = memref<64x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none>
# DUMPIR: %[[DSM0:.*]] = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
# DUMPIR: %[[C0_14:.*]] = arith.constant 0 : index
# DUMPIR: %[[VIEW:.*]] = memref.view %[[DSM0]][%[[C0_14]]][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<128x64xf16, #gpu.address_space<workgroup>>
# DUMPIR: %[[DSM1:.*]] = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
# DUMPIR: %[[C16384:.*]] = arith.constant 16384 : index
# DUMPIR: %[[VIEW_15:.*]] = memref.view %[[DSM1]][%[[C16384]]][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<64x128xf16, #gpu.address_space<workgroup>>
# DUMPIR: %[[DSM2:.*]] = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
# DUMPIR: %[[C0_16:.*]] = arith.constant 0 : index
# DUMPIR: %[[VIEW_17:.*]] = memref.view %[[DSM2]][%[[C0_16]]][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<128x64xf16, #gpu.address_space<workgroup>>
# DUMPIR: %[[DSM3:.*]] = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
# DUMPIR: %[[C16384_18:.*]] = arith.constant 16384 : index
# DUMPIR: %[[VIEW_19:.*]] = memref.view %[[DSM3]][%[[C16384_18]]][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<64x64xf16, #gpu.address_space<workgroup>>
# DUMPIR: %[[DSM4:.*]] = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
# DUMPIR: %[[C24576:.*]] = arith.constant 24576 : index
# DUMPIR: %[[VIEW_20:.*]] = memref.view %[[DSM4]][%[[C24576]]][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<64x64xf16, #gpu.address_space<workgroup>>
# DUMPIR: %[[C0_21:.*]] = arith.constant 0 : index
# DUMPIR: %[[C32768:.*]] = arith.constant 32768 : index
# DUMPIR: nvgpu.mbarrier.arrive.expect_tx %[[MB]][%[[C0_21]]], %[[C32768]], predicate = %[[EQ]] : <memorySpace = #gpu.address_space<workgroup>>
# DUMPIR: %[[C0_22:.*]] = arith.constant 0 : index
# DUMPIR: %[[C0_23:.*]] = arith.constant 0 : index
# DUMPIR: %[[C0_24:.*]] = arith.constant 0 : index
# DUMPIR: nvgpu.tma.async.load %[[TMA0]][%[[C0_23]], %[[C0_24]]], %[[MB]][%[[C0_22]]] to %[[VIEW_17]], predicate = %[[EQ]] : <tensor = memref<128x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<128x64xf16, #gpu.address_space<workgroup>>
# DUMPIR: %[[C0_25:.*]] = arith.constant 0 : index
# DUMPIR: %[[C0_26:.*]] = arith.constant 0 : index
# DUMPIR: %[[C0_27:.*]] = arith.constant 0 : index
# DUMPIR: nvgpu.tma.async.load %[[TMA1]][%[[C0_26]], %[[C0_27]]], %[[MB]][%[[C0_25]]] to %[[VIEW_19]], predicate = %[[EQ]] : <tensor = memref<64x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<64x64xf16, #gpu.address_space<workgroup>>
# DUMPIR: %[[C0_28:.*]] = arith.constant 0 : index
# DUMPIR: %[[C64_29:.*]] = arith.constant 64 : index
# DUMPIR: %[[C0_30:.*]] = arith.constant 0 : index
# DUMPIR: nvgpu.tma.async.load %[[TMA1]][%[[C64_29]], %[[C0_30]]], %[[MB]][%[[C0_28]]] to %[[VIEW_20]], predicate = %[[EQ]] : <tensor = memref<64x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<64x64xf16, #gpu.address_space<workgroup>>
# DUMPIR: %[[C0_31:.*]] = arith.constant 0 : index
# DUMPIR: %[[C10000000:.*]] = arith.constant 10000000 : index
# DUMPIR: %[[FALSE:.*]] = arith.constant false
# DUMPIR: nvgpu.mbarrier.try_wait.parity %[[MB]][%[[C0_31]]], %[[FALSE]], %[[C10000000]] : <memorySpace = #gpu.address_space<workgroup>>
# DUMPIR: %[[WG_ACC:.*]] = nvgpu.warpgroup.mma.init.accumulator -> <fragmented = vector<128x128xf32>>
# DUMPIR: %[[GEN0:.*]] = nvgpu.warpgroup.generate.descriptor %[[VIEW]], %[[TMA0]] : memref<128x64xf16, #gpu.address_space<workgroup>>, <tensor = memref<128x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none> -> <tensor = memref<128x64xf16, #gpu.address_space<workgroup>>>
# DUMPIR: %[[GEN1:.*]] = nvgpu.warpgroup.generate.descriptor %[[VIEW_15]], %[[TMA1]] : memref<64x128xf16, #gpu.address_space<workgroup>>, <tensor = memref<64x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none> -> <tensor = memref<64x128xf16, #gpu.address_space<workgroup>>>
# DUMPIR: %[[MMA:.*]] = nvgpu.warpgroup.mma %[[GEN0]], %[[GEN1]], %[[WG_ACC]] {transposeB} : <tensor = memref<128x64xf16, #gpu.address_space<workgroup>>>, <tensor = memref<64x128xf16, #gpu.address_space<workgroup>>>, <fragmented = vector<128x128xf32>> -> <fragmented = vector<128x128xf32>>
# DUMPIR: nvgpu.warpgroup.mma.store %[[MMA]], %{{.*}} : <fragmented = vector<128x128xf32>> to memref<128x128xf32>
# DUMPIR: gpu.terminator
# DUMPIR: }
# DUMPIR: %[[CPY3:.*]] = gpu.memcpy async [%{{.*}}] %[[ARG2]], %{{.*}} : memref<128x128xf32>, memref<128x128xf32>
# DUMPIR: gpu.wait async [%[[CPY3]]]
# DUMPIR: return
# DUMPIR: }
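The tail of the DUMPIR block captures the Hopper warpgroup MMA sequence that replaces the per-thread arithmetic of the earlier chapters. In order, with op names from the IR above:

# 1. nvgpu.warpgroup.mma.init.accumulator : zero a 128x128 f32 accumulator fragment
# 2. nvgpu.warpgroup.generate.descriptor  : pair each shared-memory tile with its TMA descriptor
# 3. nvgpu.warpgroup.mma {transposeB}     : issue the tensor-core matmul on those descriptors
# 4. nvgpu.warpgroup.mma.store            : write the accumulator back to the global memref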