diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index 61db779ae0b49..602100f3b9b83 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -817,7 +817,10 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
       .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}});
 
-  addRulesForGOpcs({G_AMDGPU_BUFFER_ATOMIC_SWAP}, Standard)
+  addRulesForGOpcs({G_AMDGPU_BUFFER_ATOMIC_SWAP, G_AMDGPU_BUFFER_ATOMIC_UMAX,
+                    G_AMDGPU_BUFFER_ATOMIC_UMIN, G_AMDGPU_BUFFER_ATOMIC_SMAX,
+                    G_AMDGPU_BUFFER_ATOMIC_SMIN},
+                   Standard)
       .Div(S32, {{Vgpr32}, {Vgpr32, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
       .Div(S64, {{Vgpr64}, {Vgpr64, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}});
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.integer-minmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.integer-minmax.ll
new file mode 100644
index 0000000000000..d3cd8839d127b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.integer-minmax.ll
@@ -0,0 +1,282 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -new-reg-bank-select < %s | FileCheck %s
+
+define amdgpu_ps float @test1(i32 %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
+; CHECK-LABEL: test1:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: buffer_atomic_smin v0, v1, s[0:3], s4 offen glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
+  %ret = call i32 @llvm.amdgcn.raw.buffer.atomic.smin.i32(i32 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
+  %cast = bitcast i32 %ret to float
+  ret float %cast
+}
+
+define amdgpu_ps float @test2(i32 %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
+; CHECK-LABEL: test2:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: buffer_atomic_smax v0, v1, s[0:3], s4 offen glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
+  %ret = call i32 @llvm.amdgcn.raw.buffer.atomic.smax.i32(i32 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
+  %cast = bitcast i32 %ret to float
+  ret float %cast
+}
+
+define amdgpu_ps float @test3(i32 %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
+; CHECK-LABEL: test3:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: buffer_atomic_umin v0, v1, s[0:3], s4 offen glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
+  %ret = call i32 @llvm.amdgcn.raw.buffer.atomic.umin.i32(i32 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
+  %cast = bitcast i32 %ret to float
+  ret float %cast
+}
+
+define amdgpu_ps float @test4(i32 %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
+; CHECK-LABEL: test4:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: buffer_atomic_umax v0, v1, s[0:3], s4 offen glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
+  %ret = call i32 @llvm.amdgcn.raw.buffer.atomic.umax.i32(i32 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
+  %cast = bitcast i32 %ret to float
+  ret float %cast
+}
+
+
+define amdgpu_ps <2 x float> @test5(i64 %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
+; CHECK-LABEL: test5:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: buffer_atomic_smin_x2 v[0:1], v2, s[0:3], s4 offen glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
+  %ret = call i64 @llvm.amdgcn.raw.buffer.atomic.smin.i64(i64 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
+  %cast = bitcast i64 %ret to <2 x float>
+  ret <2 x float> %cast
+}
+
+define amdgpu_ps <2 x float> @test6(i64 %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
+; CHECK-LABEL: test6:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: buffer_atomic_smax_x2 v[0:1], v2, s[0:3], s4 offen glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
+  %ret = call i64 @llvm.amdgcn.raw.buffer.atomic.smax.i64(i64 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
+  %cast = bitcast i64 %ret to <2 x float>
+  ret <2 x float> %cast
+}
+
+define amdgpu_ps <2 x float> @test7(i64 %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
+; CHECK-LABEL: test7:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: buffer_atomic_umin_x2 v[0:1], v2, s[0:3], s4 offen glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
+  %ret = call i64 @llvm.amdgcn.raw.buffer.atomic.umin.i64(i64 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
+  %cast = bitcast i64 %ret to <2 x float>
+  ret <2 x float> %cast
+}
+
+define amdgpu_ps <2 x float> @test8(i64 %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
+; CHECK-LABEL: test8:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: buffer_atomic_umax_x2 v[0:1], v2, s[0:3], s4 offen glc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
+  %ret = call i64 @llvm.amdgcn.raw.buffer.atomic.umax.i64(i64 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
+  %cast = bitcast i64 %ret to <2 x float>
+  ret <2 x float> %cast
+}
+
+define amdgpu_ps float @wf_test1(i32 %val, <4 x i32> %rsrc, i32 %voffset, i32 inreg %soffset) {
+; CHECK-LABEL: wf_test1:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_mov_b32 s2, exec_lo
+; CHECK-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: v_readfirstlane_b32 s4, v1
+; CHECK-NEXT: v_readfirstlane_b32 s5, v2
+; CHECK-NEXT: v_readfirstlane_b32 s6, v3
+; CHECK-NEXT: v_readfirstlane_b32 s7, v4
+; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
+; CHECK-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[3:4]
+; CHECK-NEXT: s_and_b32 s1, vcc_lo, s1
+; CHECK-NEXT: s_and_saveexec_b32 s1, s1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_atomic_smin v0, v5, s[4:7], s0 offen glc
+; CHECK-NEXT: ; implicit-def: $vgpr1
+; CHECK-NEXT: ; implicit-def: $vgpr5
+; CHECK-NEXT: ; implicit-def: $vgpr3_vgpr4
+; CHECK-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; CHECK-NEXT: s_xor_b32 exec_lo, exec_lo, s1
+; CHECK-NEXT: s_cbranch_execnz .LBB8_1
+; CHECK-NEXT: ; %bb.2:
+; CHECK-NEXT: s_mov_b32 exec_lo, s2
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
+  %ret = call i32 @llvm.amdgcn.raw.buffer.atomic.smin.i32(i32 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
+  %cast = bitcast i32 %ret to float
+  ret float %cast
+}
+
+define amdgpu_ps <2 x float> @wf_test2(i64 %val, <4 x i32> %rsrc, i32 %voffset, i32 inreg %soffset) {
+; CHECK-LABEL: wf_test2:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_mov_b32 s2, exec_lo
+; CHECK-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-NEXT: v_readfirstlane_b32 s7, v5
+; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[2:3]
+; CHECK-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[4:5]
+; CHECK-NEXT: s_and_b32 s1, vcc_lo, s1
+; CHECK-NEXT: s_and_saveexec_b32 s1, s1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_atomic_smax_x2 v[0:1], v6, s[4:7], s0 offen glc
+; CHECK-NEXT: ; implicit-def: $vgpr2
+; CHECK-NEXT: ; implicit-def: $vgpr6
+; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5
+; CHECK-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; CHECK-NEXT: s_xor_b32 exec_lo, exec_lo, s1
+; CHECK-NEXT: s_cbranch_execnz .LBB9_1
+; CHECK-NEXT: ; %bb.2:
+; CHECK-NEXT: s_mov_b32 exec_lo, s2
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
+  %ret = call i64 @llvm.amdgcn.raw.buffer.atomic.smax.i64(i64 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
+  %cast = bitcast i64 %ret to <2 x float>
+  ret <2 x float> %cast
+}
+
+; Waterfall tests - divergent soffset (VGPR soffset)
+define amdgpu_ps float @wf_test3(i32 %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 %soffset) {
+; CHECK-LABEL: wf_test3:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_mov_b32 s4, exec_lo
+; CHECK-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: v_readfirstlane_b32 s6, v2
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc_lo, s6, v2
+; CHECK-NEXT: s_and_saveexec_b32 s5, vcc_lo
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_atomic_umin v0, v1, s[0:3], s6 offen glc
+; CHECK-NEXT: ; implicit-def: $vgpr2
+; CHECK-NEXT: ; implicit-def: $vgpr1
+; CHECK-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; CHECK-NEXT: s_xor_b32 exec_lo, exec_lo, s5
+; CHECK-NEXT: s_cbranch_execnz .LBB10_1
+; CHECK-NEXT: ; %bb.2:
+; CHECK-NEXT: s_mov_b32 exec_lo, s4
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
+  %ret = call i32 @llvm.amdgcn.raw.buffer.atomic.umin.i32(i32 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
+  %cast = bitcast i32 %ret to float
+  ret float %cast
+}
+
+define amdgpu_ps <2 x float> @wf_test5(i64 %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 %soffset) {
+; CHECK-LABEL: wf_test5:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_mov_b32 s4, exec_lo
+; CHECK-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: v_readfirstlane_b32 s6, v3
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc_lo, s6, v3
+; CHECK-NEXT: s_and_saveexec_b32 s5, vcc_lo
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_atomic_umax_x2 v[0:1], v2, s[0:3], s6 offen glc
+; CHECK-NEXT: ; implicit-def: $vgpr3
+; CHECK-NEXT: ; implicit-def: $vgpr2
+; CHECK-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; CHECK-NEXT: s_xor_b32 exec_lo, exec_lo, s5
+; CHECK-NEXT: s_cbranch_execnz .LBB11_1
+; CHECK-NEXT: ; %bb.2:
+; CHECK-NEXT: s_mov_b32 exec_lo, s4
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
+  %ret = call i64 @llvm.amdgcn.raw.buffer.atomic.umax.i64(i64 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
+  %cast = bitcast i64 %ret to <2 x float>
+  ret <2 x float> %cast
+}
+
+; Waterfall tests - both divergent (VGPR rsrc and soffset)
+define amdgpu_ps float @wf_test6(i32 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset) {
+; CHECK-LABEL: wf_test6:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_mov_b32 s2, exec_lo
+; CHECK-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: v_readfirstlane_b32 s4, v1
+; CHECK-NEXT: v_readfirstlane_b32 s5, v2
+; CHECK-NEXT: v_readfirstlane_b32 s6, v3
+; CHECK-NEXT: v_readfirstlane_b32 s7, v4
+; CHECK-NEXT: v_readfirstlane_b32 s3, v6
+; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
+; CHECK-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
+; CHECK-NEXT: v_cmp_eq_u32_e64 s1, s3, v6
+; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0
+; CHECK-NEXT: s_and_b32 s0, s0, s1
+; CHECK-NEXT: s_and_saveexec_b32 s0, s0
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_atomic_smin v0, v5, s[4:7], s3 offen glc
+; CHECK-NEXT: ; implicit-def: $vgpr1
+; CHECK-NEXT: ; implicit-def: $vgpr6
+; CHECK-NEXT: ; implicit-def: $vgpr5
+; CHECK-NEXT: ; implicit-def: $vgpr3_vgpr4
+; CHECK-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; CHECK-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; CHECK-NEXT: s_cbranch_execnz .LBB12_1
+; CHECK-NEXT: ; %bb.2:
+; CHECK-NEXT: s_mov_b32 exec_lo, s2
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
+  %ret = call i32 @llvm.amdgcn.raw.buffer.atomic.smin.i32(i32 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
+  %cast = bitcast i32 %ret to float
+  ret float %cast
+}
+
+define amdgpu_ps <2 x float> @wf_test7(i64 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset) {
+; CHECK-LABEL: wf_test7:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_mov_b32 s2, exec_lo
+; CHECK-NEXT: .LBB13_1: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-NEXT: v_readfirstlane_b32 s5, v3
+; CHECK-NEXT: v_readfirstlane_b32 s6, v4
+; CHECK-NEXT: v_readfirstlane_b32 s7, v5
+; CHECK-NEXT: v_readfirstlane_b32 s3, v7
+; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[2:3]
+; CHECK-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[4:5]
+; CHECK-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
+; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0
+; CHECK-NEXT: s_and_b32 s0, s0, s1
+; CHECK-NEXT: s_and_saveexec_b32 s0, s0
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_atomic_umin_x2 v[0:1], v6, s[4:7], s3 offen glc
+; CHECK-NEXT: ; implicit-def: $vgpr2
+; CHECK-NEXT: ; implicit-def: $vgpr7
+; CHECK-NEXT: ; implicit-def: $vgpr6
+; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5
+; CHECK-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
+; CHECK-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; CHECK-NEXT: s_cbranch_execnz .LBB13_1
+; CHECK-NEXT: ; %bb.2:
+; CHECK-NEXT: s_mov_b32 exec_lo, s2
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
+  %ret = call i64 @llvm.amdgcn.raw.buffer.atomic.umin.i64(i64 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
+  %cast = bitcast i64 %ret to <2 x float>
+  ret <2 x float> %cast
+}
+
+
+declare i32 @llvm.amdgcn.raw.buffer.atomic.smin.i32(i32, <4 x i32>, i32, i32, i32 immarg) #0
+declare i32 @llvm.amdgcn.raw.buffer.atomic.smax.i32(i32, <4 x i32>, i32, i32, i32 immarg) #0
+declare i32 @llvm.amdgcn.raw.buffer.atomic.umin.i32(i32, <4 x i32>, i32, i32, i32 immarg) #0
+declare i32 @llvm.amdgcn.raw.buffer.atomic.umax.i32(i32, <4 x i32>, i32, i32, i32 immarg) #0
+
+declare i64 @llvm.amdgcn.raw.buffer.atomic.smin.i64(i64, <4 x i32>, i32, i32, i32 immarg) #0
+declare i64 @llvm.amdgcn.raw.buffer.atomic.smax.i64(i64, <4 x i32>, i32, i32, i32 immarg) #0
+declare i64 @llvm.amdgcn.raw.buffer.atomic.umin.i64(i64, <4 x i32>, i32, i32, i32 immarg) #0
+declare i64 @llvm.amdgcn.raw.buffer.atomic.umax.i64(i64, <4 x i32>, i32, i32, i32 immarg) #0
+
+attributes #0 = { nounwind }