diff --git a/mlir/test/python/dialects/nvvm.py b/mlir/test/python/dialects/nvvm.py
index 3eb62bef50de9..d795524222fd2 100644
--- a/mlir/test/python/dialects/nvvm.py
+++ b/mlir/test/python/dialects/nvvm.py
@@ -15,7 +15,9 @@ def constructAndPrintInModule(f): module = Module.create() with InsertionPoint(module.body): f() + print(module) + module.operation.verify() return f
@@ -89,3 +91,133 @@ def my_inline_ptx(a, b, c, d): arith.addf(a, b) arith.addi(c, d) arith.addf(wo0, wo1)
+
+
+@constructAndPrintInModule
+def test_barriers():
+    """Emit the NVVM barrier, cluster and fence ops that the CHECK lines pin."""
+    int_ty, float_ty = T.i32(), T.f32()
+
+    @func.FuncOp.from_py_func(int_ty, int_ty, float_ty)
+    def barriers(mask, vi32, vf32):
+        barrier_id = arith.constant(T.i32(), 0)
+        thread_count = arith.constant(T.i32(), 0xFFFF)
+        predicate = nvvm.barrier(
+            res=int_ty, barrier_id=barrier_id, number_of_threads=thread_count
+        )
+
+        # Chain the reductions: each barrier consumes the previous i32 result.
+        reductions = nvvm.BarrierReduction
+        for reduction_op in (reductions.AND, reductions.OR, reductions.POPC):
+            predicate = nvvm.barrier(
+                res=int_ty, reduction_op=reduction_op, reduction_predicate=predicate
+            )
+
+        nvvm.barrier0()
+        nvvm.bar_warp_sync(mask)
+
+        # Each cluster op is emitted plain first, then with aligned=True.
+        cluster_ops = (
+            nvvm.cluster_arrive,
+            nvvm.cluster_arrive_relaxed,
+            nvvm.cluster_wait,
+        )
+        for cluster_op in cluster_ops:
+            cluster_op()
+            cluster_op(aligned=True)
+
+        nvvm.fence_mbarrier_init()
+        # Second warp sync; the CHECK lines below expect both occurrences.
+        nvvm.bar_warp_sync(mask)
+        return predicate
+
+
+# CHECK-LABEL: func.func @barriers(
+# CHECK: %[[ARG0:.*]]: i32, %[[ARG1:.*]]: i32, %[[ARG2:.*]]: f32) -> i32 {
+# CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : i32
+# CHECK: %[[CONSTANT_1:.*]] = arith.constant 65535 : i32
+# CHECK: %[[BARRIER_0:.*]] = nvvm.barrier id = %[[CONSTANT_0]] number_of_threads = %[[CONSTANT_1]] -> i32
+# CHECK: %[[BARRIER_1:.*]] = nvvm.barrier #nvvm.reduction<and> %[[BARRIER_0]] -> i32
+# CHECK: %[[BARRIER_2:.*]] = nvvm.barrier #nvvm.reduction<or> %[[BARRIER_1]] -> i32
+# CHECK: %[[BARRIER_3:.*]] = nvvm.barrier #nvvm.reduction<popc> %[[BARRIER_2]] -> i32
+# CHECK: nvvm.barrier0
+# CHECK: nvvm.bar.warp.sync %[[ARG0]] : i32
+# CHECK: nvvm.cluster.arrive
+# 
CHECK: nvvm.cluster.arrive {aligned}
+# CHECK: nvvm.cluster.arrive.relaxed
+# CHECK: nvvm.cluster.arrive.relaxed {aligned}
+# CHECK: nvvm.cluster.wait
+# CHECK: nvvm.cluster.wait {aligned}
+# CHECK: nvvm.fence.mbarrier.init
+# CHECK: nvvm.bar.warp.sync %[[ARG0]] : i32
+# CHECK: return %[[BARRIER_3]] : i32
+# CHECK: }
+
+
+@constructAndPrintInModule
+def test_reductions():
+    """Emit nvvm.redux.sync for each ReduxKind, sweeping the abs/nan flags."""
+    i32 = T.i32()
+    f32 = T.f32()
+
+    @func.FuncOp.from_py_func(i32, i32, f32)
+    def reductions(mask, vi32, vf32):
+        kinds = nvvm.ReduxKind
+        int_kinds = (
+            kinds.AND, kinds.MAX, kinds.MIN, kinds.OR, kinds.UMAX, kinds.UMIN, kinds.XOR
+        )
+        float_kinds = (kinds.FMIN, kinds.FMAX)
+
+        # Loop names avoid shadowing the abs() builtin; the abs=/nan= keywords
+        # below are redux_sync's own parameter names and must stay unchanged.
+        for use_abs in (True, False):
+            for use_nan in (True, False):
+                # Integer kinds are called without flags, so they repeat for
+                # every combination; the CHECK lines expect all four rounds.
+                for kind in int_kinds:
+                    nvvm.redux_sync(i32, vi32, kind, vi32)
+
+                for kind in float_kinds:
+                    nvvm.redux_sync(f32, vf32, kind, vi32, abs=use_abs, nan=use_nan)
+
+
+# CHECK-LABEL: func.func @reductions(
+# CHECK: %[[ARG0:.*]]: i32, %[[ARG1:.*]]: i32, %[[ARG2:.*]]: f32) {
+# CHECK: %[[REDUX_0:.*]] = nvvm.redux.sync and %[[ARG1]], %[[ARG1]] : i32 -> i32
+# CHECK: %[[REDUX_1:.*]] = nvvm.redux.sync max %[[ARG1]], %[[ARG1]] : i32 -> i32
+# CHECK: %[[REDUX_2:.*]] = nvvm.redux.sync min %[[ARG1]], %[[ARG1]] : i32 -> i32
+# CHECK: %[[REDUX_3:.*]] = nvvm.redux.sync or %[[ARG1]], %[[ARG1]] : i32 -> i32
+# CHECK: %[[REDUX_4:.*]] = nvvm.redux.sync umax %[[ARG1]], %[[ARG1]] : i32 -> i32
+# CHECK: %[[REDUX_5:.*]] = nvvm.redux.sync umin %[[ARG1]], %[[ARG1]] : i32 -> i32
+# CHECK: %[[REDUX_6:.*]] = nvvm.redux.sync xor %[[ARG1]], %[[ARG1]] : i32 -> i32
+# CHECK: %[[REDUX_7:.*]] = nvvm.redux.sync fmin %[[ARG2]], %[[ARG1]] {abs = true, nan = true} : f32 -> f32
+# CHECK: %[[REDUX_8:.*]] = nvvm.redux.sync fmax %[[ARG2]], %[[ARG1]] {abs = true, nan = true} : f32 -> f32
+# CHECK: %[[REDUX_9:.*]] = nvvm.redux.sync and %[[ARG1]], %[[ARG1]] : i32 -> i32
+# CHECK: %[[REDUX_10:.*]] = nvvm.redux.sync max %[[ARG1]], %[[ARG1]] : i32 -> i32
+# CHECK: %[[REDUX_11:.*]] = nvvm.redux.sync min %[[ARG1]], %[[ARG1]] : i32 -> i32
+# CHECK: %[[REDUX_12:.*]] = nvvm.redux.sync or %[[ARG1]], %[[ARG1]] : i32 -> i32
+# CHECK: %[[REDUX_13:.*]] = nvvm.redux.sync umax %[[ARG1]], %[[ARG1]] : i32 -> i32
+# CHECK: %[[REDUX_14:.*]] = nvvm.redux.sync umin %[[ARG1]], %[[ARG1]] : i32 -> i32
+# CHECK: %[[REDUX_15:.*]] = nvvm.redux.sync xor %[[ARG1]], %[[ARG1]] : i32 -> i32
+# CHECK: %[[REDUX_16:.*]] = nvvm.redux.sync fmin %[[ARG2]], %[[ARG1]] {abs = true} : f32 -> f32
+# CHECK: %[[REDUX_17:.*]] = nvvm.redux.sync fmax %[[ARG2]], %[[ARG1]] {abs = true} : f32 -> f32
+# CHECK: %[[REDUX_18:.*]] = nvvm.redux.sync and %[[ARG1]], %[[ARG1]] : i32 -> i32
+# CHECK: %[[REDUX_19:.*]] = nvvm.redux.sync max %[[ARG1]], %[[ARG1]] : i32 -> i32
+# CHECK: %[[REDUX_20:.*]] = nvvm.redux.sync min %[[ARG1]], %[[ARG1]] : i32 -> i32
+# CHECK: %[[REDUX_21:.*]] = nvvm.redux.sync or %[[ARG1]], %[[ARG1]] : i32 -> i32
+# CHECK: %[[REDUX_22:.*]] = nvvm.redux.sync umax %[[ARG1]], %[[ARG1]] : i32 -> i32
+# CHECK: %[[REDUX_23:.*]] = nvvm.redux.sync umin %[[ARG1]], %[[ARG1]] : i32 -> i32
+# CHECK: %[[REDUX_24:.*]] = nvvm.redux.sync xor %[[ARG1]], %[[ARG1]] : i32 -> i32
+# CHECK: %[[REDUX_25:.*]] = nvvm.redux.sync fmin %[[ARG2]], %[[ARG1]] {nan = true} : f32 -> f32
+# CHECK: %[[REDUX_26:.*]] = nvvm.redux.sync fmax %[[ARG2]], %[[ARG1]] {nan = true} : f32 -> f32
+# CHECK: %[[REDUX_27:.*]] = nvvm.redux.sync and %[[ARG1]], %[[ARG1]] : i32 -> i32
+# CHECK: %[[REDUX_28:.*]] = nvvm.redux.sync max %[[ARG1]], %[[ARG1]] : i32 -> i32
+# CHECK: %[[REDUX_29:.*]] = nvvm.redux.sync min %[[ARG1]], %[[ARG1]] : i32 -> i32
+# CHECK: %[[REDUX_30:.*]] = nvvm.redux.sync or %[[ARG1]], %[[ARG1]] : i32 -> i32
+# CHECK: %[[REDUX_31:.*]] = nvvm.redux.sync umax %[[ARG1]], %[[ARG1]] : i32 -> i32
+# CHECK: %[[REDUX_32:.*]] = nvvm.redux.sync umin %[[ARG1]], %[[ARG1]] : i32 -> i32
+# CHECK: %[[REDUX_33:.*]] = nvvm.redux.sync xor %[[ARG1]], %[[ARG1]] : i32 -> i32
+# CHECK: %[[REDUX_34:.*]] = nvvm.redux.sync fmin %[[ARG2]], %[[ARG1]] : f32 -> f32
+# CHECK: %[[REDUX_35:.*]] = nvvm.redux.sync fmax %[[ARG2]], %[[ARG1]] : f32 -> f32
+# CHECK: return
+# CHECK: }