diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll index 523d51ddcd2bc..fc236147f1238 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll @@ -1,12 +1,21 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -O0 -stop-after=irtranslator -global-isel %s -o - 2>&1 | FileCheck %s ; This file checks that the translation from llvm IR to generic ; MachineInstr is correct. ; Tests for add. -; CHECK: name: addi32 -; CHECK: {{%[0-9]+}}:_(s32) = G_ADD -define amdgpu_kernel void @addi32(i32 %arg1, i32 %arg2) { +define void @addi32(i32 %arg1, i32 %arg2) { + ; CHECK-LABEL: name: addi32 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY1]] + ; CHECK-NEXT: G_STORE [[ADD]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1) + ; CHECK-NEXT: SI_RETURN %res = add i32 %arg1, %arg2 store i32 %res, ptr addrspace(1) poison ret void diff --git a/llvm/test/CodeGen/AMDGPU/add_i64.ll b/llvm/test/CodeGen/AMDGPU/add_i64.ll index eedd56de8c7c2..a8560e8498c4d 100644 --- a/llvm/test/CodeGen/AMDGPU/add_i64.ll +++ b/llvm/test/CodeGen/AMDGPU/add_i64.ll @@ -1,12 +1,29 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=SI %s - -declare i32 @llvm.amdgcn.workitem.id.x() readnone - -; SI-LABEL: {{^}}test_i64_vreg: -; SI: v_add_i32 -; SI: v_addc_u32 define amdgpu_kernel void @test_i64_vreg(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) { +; SI-LABEL: test_i64_vreg: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s14, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b64 s[6:7], s[14:15] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[12:13], s[2:3] +; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[12:15], 0 addr64 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone %a_ptr = getelementptr i64, ptr addrspace(1) %inA, i32 %tid %b_ptr = getelementptr i64, ptr addrspace(1) %inB, i32 %tid @@ -18,10 +35,22 @@ define amdgpu_kernel void @test_i64_vreg(ptr addrspace(1) noalias %out, ptr addr } ; Check that the SGPR add operand is correctly moved to a VGPR. -; SI-LABEL: {{^}}sgpr_operand: -; SI: s_add_u32 -; SI: s_addc_u32 define amdgpu_kernel void @sgpr_operand(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %in_bar, i64 %a) { +; SI-LABEL: sgpr_operand: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xf +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_add_u32 s4, s6, s4 +; SI-NEXT: s_addc_u32 s5, s7, s5 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_endpgm %foo = load i64, ptr addrspace(1) %in, align 8 %result = add i64 %foo, %a store i64 %result, ptr addrspace(1) %out @@ -30,35 +59,76 @@ define amdgpu_kernel void @sgpr_operand(ptr addrspace(1) noalias %out, ptr addrs ; Swap the arguments. Check that the SGPR -> VGPR copy works with the ; SGPR as other operand. -; -; SI-LABEL: {{^}}sgpr_operand_reversed: -; SI: s_add_u32 -; SI: s_addc_u32 define amdgpu_kernel void @sgpr_operand_reversed(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i64 %a) { +; SI-LABEL: sgpr_operand_reversed: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_add_u32 s4, s4, s6 +; SI-NEXT: s_addc_u32 s5, s5, s7 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_endpgm %foo = load i64, ptr addrspace(1) %in, align 8 %result = add i64 %a, %foo store i64 %result, ptr addrspace(1) %out ret void } - -; SI-LABEL: {{^}}test_v2i64_sreg: -; SI: s_add_u32 -; SI: s_addc_u32 -; SI: s_add_u32 -; SI: s_addc_u32 define amdgpu_kernel void @test_v2i64_sreg(ptr addrspace(1) noalias %out, <2 x i64> %a, <2 x i64> %b) { +; SI-LABEL: test_v2i64_sreg: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_add_u32 s4, s10, s14 +; SI-NEXT: s_addc_u32 s5, s11, s15 +; SI-NEXT: s_add_u32 s6, s8, s12 +; SI-NEXT: s_addc_u32 s7, s9, s13 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: v_mov_b32_e32 v3, s5 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_endpgm %result = add <2 x i64> %a, %b store <2 x i64> %result, ptr addrspace(1) %out ret void } -; SI-LABEL: {{^}}test_v2i64_vreg: -; SI: v_add_i32 -; SI: v_addc_u32 -; SI: v_add_i32 -; SI: v_addc_u32 define amdgpu_kernel void @test_v2i64_vreg(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) { +; SI-LABEL: test_v2i64_vreg: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s14, 0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; SI-NEXT: v_mov_b32_e32 v5, 0 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b64 s[6:7], s[14:15] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[12:13], s[2:3] +; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[12:15], 0 addr64 +; SI-NEXT: buffer_load_dwordx4 v[4:7], v[4:5], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; SI-NEXT: v_addc_u32_e32 v3, vcc, v3, v7, vcc +; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; SI-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; SI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone %a_ptr = getelementptr <2 x i64>, ptr addrspace(1) %inA, i32 %tid %b_ptr = getelementptr <2 x i64>, ptr addrspace(1) %inB, i32 %tid @@ -69,14 +139,19 @@ define amdgpu_kernel void @test_v2i64_vreg(ptr addrspace(1) noalias %out, ptr ad ret void } -; SI-LABEL: {{^}}trunc_i64_add_to_i32: -; SI: s_load_dword s[[SREG0:[0-9]+]] -; SI: s_load_dword s[[SREG1:[0-9]+]] -; SI: s_add_i32 [[SRESULT:s[0-9]+]], s[[SREG1]], s[[SREG0]] -; SI-NOT: addc -; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] -; SI: buffer_store_dword [[VRESULT]], define amdgpu_kernel void @trunc_i64_add_to_i32(ptr addrspace(1) %out, i32, i64 %a, i32, i64 %b) { +; SI-LABEL: trunc_i64_add_to_i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s2, s[4:5], 0xd +; SI-NEXT: s_load_dword s6, s[4:5], 0x11 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_add_i32 s4, s6, s2 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %add = add i64 %b, %a %trunc = trunc i64 %add to i32 store i32 %trunc, ptr addrspace(1) %out, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll b/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll index 45192be37c242..9a4040a25419a 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll @@ -1,24 +1,94 @@ -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefixes=SI,SICI,SICIVI,PREGFX11,GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefixes=SICI,CIVI,SICIVI,PREGFX11,GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=CIVI,SICIVI,GFX8PLUS,PREGFX11,GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9PLUS,GFX8PLUS,PREGFX11,GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX11,GFX9PLUS,GFX8PLUS,GCN %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefixes=GFX7 %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GFX8 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX11 %s -; GCN-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_offset: -; GFX9PLUS-NOT: m0 -; SICIVI-DAG: s_mov_b32 m0 - -; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13 -; SICI-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c -; GFX8PLUS-DAG: s_load_{{dword|b32}} [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c -; GFX8PLUS-DAG: s_load_{{dword|b32}} [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x70 -; GCN-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7 -; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] -; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]] -; PREGFX11: ds_cmpst_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VCMP]], [[VSWAP]] offset:16 -; GFX11: ds_cmpstore_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VSWAP]], [[VCMP]] offset:16 -; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_offset(ptr addrspace(1) %out, [8 x i32], ptr addrspace(3) %ptr, [8 x i32], i32 %swap) nounwind { +; CHECK-LABEL: lds_atomic_cmpxchg_ret_i32_offset: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dword s2, s[4:5], 0x13 +; CHECK-NEXT: s_load_dword s3, s[4:5], 0x1c +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CHECK-NEXT: v_mov_b32_e32 v0, 7 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s2 +; CHECK-NEXT: v_mov_b32_e32 v2, s3 +; CHECK-NEXT: s_mov_b32 m0, -1 +; CHECK-NEXT: ds_cmpst_rtn_b32 v0, v1, v0, v2 offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s3, 0xf000 +; CHECK-NEXT: s_mov_b32 s2, -1 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; CHECK-NEXT: s_endpgm +; +; GFX7-LABEL: lds_atomic_cmpxchg_ret_i32_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x13 +; GFX7-NEXT: s_load_dword s3, s[4:5], 0x1c +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX7-NEXT: v_mov_b32_e32 v0, 7 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v0, v1, v0, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: lds_atomic_cmpxchg_ret_i32_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x4c +; GFX8-NEXT: s_load_dword s3, s[4:5], 0x70 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-NEXT: v_mov_b32_e32 v0, 7 +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s3 +; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v1, v0, v2 offset:16 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: lds_atomic_cmpxchg_ret_i32_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x4c +; GFX9-NEXT: s_load_dword s3, s[4:5], 0x70 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 7 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: ds_cmpst_rtn_b32 v0, v1, v0, v2 offset:16 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: lds_atomic_cmpxchg_ret_i32_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x4c +; GFX11-NEXT: s_load_b32 s1, s[4:5], 0x70 +; GFX11-NEXT: v_mov_b32_e32 v0, 7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: ds_cmpstore_rtn_b32 v0, v1, v2, v0 offset:16 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4 %pair = cmpxchg ptr addrspace(3) %gep, i32 7, i32 %swap seq_cst monotonic %result = extractvalue { i32, i1 } %pair, 0 @@ -26,24 +96,100 @@ define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_offset(ptr addrspace(1) %o ret void } -; GCN-LABEL: {{^}}lds_atomic_cmpxchg_ret_i64_offset: -; GFX9PLUS-NOT: m0 -; SICIVI-DAG: s_mov_b32 m0 - -; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SICI-DAG: s_load_dwordx2 s[[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0xd -; GFX8PLUS-DAG: s_load_{{dword|b32}} [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; GFX8PLUS-DAG: s_load_{{dwordx2|b64}} s[[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x34 -; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7 -; GCN-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0 -; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] -; GCN-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]] -; GCN-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]] -; PREGFX11: ds_cmpst_rtn_b64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v[[[LOVCMP]]:[[HIVCMP]]], v[[[LOSWAPV]]:[[HISWAPV]]] offset:32 -; GFX11: ds_cmpstore_rtn_b64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v[[[LOSWAPV]]:[[HISWAPV]]], v[[[LOVCMP]]:[[HIVCMP]]] offset:32 -; GCN: [[RESULT]] -; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr, i64 %swap) nounwind { +; CHECK-LABEL: lds_atomic_cmpxchg_ret_i64_offset: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dword s6, s[4:5], 0xb +; CHECK-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0xd +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CHECK-NEXT: v_mov_b32_e32 v0, 7 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v4, s6 +; CHECK-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-NEXT: v_mov_b32_e32 v3, s3 +; CHECK-NEXT: s_mov_b32 m0, -1 +; CHECK-NEXT: ds_cmpst_rtn_b64 v[0:1], v4, v[0:1], v[2:3] offset:32 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s3, 0xf000 +; CHECK-NEXT: s_mov_b32 s2, -1 +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; CHECK-NEXT: s_endpgm +; +; GFX7-LABEL: lds_atomic_cmpxchg_ret_i64_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0xd +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX7-NEXT: v_mov_b32_e32 v0, 7 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_cmpst_rtn_b64 v[0:1], v4, v[0:1], v[2:3] offset:32 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: lds_atomic_cmpxchg_ret_i64_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX8-NEXT: v_mov_b32_e32 v0, 7 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: ds_cmpst_rtn_b64 v[0:1], v4, v[0:1], v[2:3] offset:32 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: lds_atomic_cmpxchg_ret_i64_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 7 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: ds_cmpst_rtn_b64 v[0:1], v4, v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: lds_atomic_cmpxchg_ret_i64_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v0, 7 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v4, s2 +; GFX11-NEXT: v_mov_b32_e32 v3, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: ds_cmpstore_rtn_b64 v[0:1], v4, v[2:3], v[0:1] offset:32 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_endpgm %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4 %pair = cmpxchg ptr addrspace(3) %gep, i64 7, i64 %swap seq_cst monotonic %result = extractvalue { i64, i1 } %pair, 0 @@ -51,13 +197,103 @@ define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i64_offset(ptr addrspace(1) %o ret void } -; GCN-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_bad_si_offset -; GFX9PLUS-NOT: m0 -; SI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; CIVI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GFX9PLUS: ds_{{cmpst|cmpstore}}_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr, i32 %swap, i32 %a, i32 %b) nounwind { +; CHECK-LABEL: lds_atomic_cmpxchg_ret_i32_bad_si_offset: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xc +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_load_dword s3, s[4:5], 0xb +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; CHECK-NEXT: s_sub_i32 s1, s1, s2 +; CHECK-NEXT: s_lshl_b32 s1, s1, 2 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_add_i32 s1, s3, s1 +; CHECK-NEXT: s_add_i32 s1, s1, 16 +; CHECK-NEXT: v_mov_b32_e32 v0, 7 +; CHECK-NEXT: v_mov_b32_e32 v1, s0 +; CHECK-NEXT: v_mov_b32_e32 v2, s1 +; CHECK-NEXT: s_mov_b32 m0, -1 +; CHECK-NEXT: ds_cmpst_rtn_b32 v0, v2, v0, v1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s7, 0xf000 +; CHECK-NEXT: s_mov_b32 s6, -1 +; CHECK-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CHECK-NEXT: s_endpgm +; +; GFX7-LABEL: lds_atomic_cmpxchg_ret_i32_bad_si_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GFX7-NEXT: v_mov_b32_e32 v0, 7 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_sub_i32 s2, s2, s3 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_lshl_b32 s1, s2, 2 +; GFX7-NEXT: s_add_i32 s0, s0, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: ds_cmpst_rtn_b32 v0, v2, v0, v1 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: lds_atomic_cmpxchg_ret_i32_bad_si_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; GFX8-NEXT: v_mov_b32_e32 v0, 7 +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_sub_i32 s2, s2, s3 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_lshl_b32 s1, s2, 2 +; GFX8-NEXT: s_add_i32 s0, s0, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v2, v0, v1 offset:16 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: lds_atomic_cmpxchg_ret_i32_bad_si_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 7 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_sub_i32 s2, s2, s3 +; GFX9-NEXT: s_lshl_b32 s2, s2, 2 +; GFX9-NEXT: s_add_i32 s0, s0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: ds_cmpst_rtn_b32 v0, v1, v0, v2 offset:16 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_store_dword v1, v0, s[6:7] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: lds_atomic_cmpxchg_ret_i32_bad_si_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NEXT: v_mov_b32_e32 v0, 7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_sub_i32 s2, s2, s3 +; GFX11-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-NEXT: s_lshl_b32 s2, s2, 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_i32 s0, s0, s2 +; GFX11-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: ds_cmpstore_rtn_b32 v0, v1, v2, v0 offset:16 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_endpgm %sub = sub i32 %a, %b %add = add i32 %sub, 4 %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 %add @@ -67,45 +303,152 @@ define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(ptr addrspac ret void } -; GCN-LABEL: {{^}}lds_atomic_cmpxchg_noret_i32_offset: -; GFX9PLUS-NOT: m0 -; SICIVI-DAG: s_mov_b32 m0 - - -; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9 -; SICI-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x12 -; GFX8PLUS-DAG: s_load_{{dword|b32}} [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24 -; GFX8PLUS-DAG: s_load_{{dword|b32}} [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x48 -; GCN-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7 -; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] -; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]] -; PREGFX11: ds_cmpst_b32 [[VPTR]], [[VCMP]], [[VSWAP]] offset:16 -; GFX11: ds_cmpstore_b32 [[VPTR]], [[VSWAP]], [[VCMP]] offset:16 -; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_cmpxchg_noret_i32_offset(ptr addrspace(3) %ptr, [8 x i32], i32 %swap) nounwind { +; CHECK-LABEL: lds_atomic_cmpxchg_noret_i32_offset: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dword s0, s[4:5], 0x9 +; CHECK-NEXT: s_load_dword s1, s[4:5], 0x12 +; CHECK-NEXT: v_mov_b32_e32 v0, 7 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s0 +; CHECK-NEXT: v_mov_b32_e32 v2, s1 +; CHECK-NEXT: s_mov_b32 m0, -1 +; CHECK-NEXT: ds_cmpst_b32 v1, v0, v2 offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_endpgm +; +; GFX7-LABEL: lds_atomic_cmpxchg_noret_i32_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dword s0, s[4:5], 0x9 +; GFX7-NEXT: s_load_dword s1, s[4:5], 0x12 +; GFX7-NEXT: v_mov_b32_e32 v0, 7 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v1, v0, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: lds_atomic_cmpxchg_noret_i32_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX8-NEXT: s_load_dword s1, s[4:5], 0x48 +; GFX8-NEXT: v_mov_b32_e32 v0, 7 +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: ds_cmpst_b32 v1, v0, v2 offset:16 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: lds_atomic_cmpxchg_noret_i32_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s1, s[4:5], 0x48 +; GFX9-NEXT: v_mov_b32_e32 v0, 7 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: ds_cmpst_b32 v1, v0, v2 offset:16 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: lds_atomic_cmpxchg_noret_i32_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s1, s[4:5], 0x48 +; GFX11-NEXT: v_mov_b32_e32 v0, 7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-NEXT: ds_cmpstore_b32 v1, v2, v0 offset:16 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4 %pair = cmpxchg ptr addrspace(3) %gep, i32 7, i32 %swap seq_cst monotonic %result = extractvalue { i32, i1 } %pair, 0 ret void } -; GCN-LABEL: {{^}}lds_atomic_cmpxchg_noret_i64_offset: -; GFX9PLUS-NOT: m0 -; SICIVI-DAG: s_mov_b32 m0 - -; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9 -; SICI-DAG: s_load_dwordx2 s[[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; GFX8PLUS-DAG: s_load_{{dword|b32}} [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24 -; GFX8PLUS-DAG: s_load_{{dwordx2|b64}} s[[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7 -; GCN-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0 -; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] -; GCN-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]] -; GCN-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]] -; PREGFX11: ds_cmpst_b64 [[VPTR]], v[[[LOVCMP]]:[[HIVCMP]]], v[[[LOSWAPV]]:[[HISWAPV]]] offset:32 -; GFX11: ds_cmpstore_b64 [[VPTR]], v[[[LOSWAPV]]:[[HISWAPV]]], v[[[LOVCMP]]:[[HIVCMP]]] offset:32 -; GCN: s_endpgm define amdgpu_kernel void @lds_atomic_cmpxchg_noret_i64_offset(ptr addrspace(3) %ptr, i64 %swap) nounwind { +; CHECK-LABEL: lds_atomic_cmpxchg_noret_i64_offset: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_load_dword s2, s[4:5], 0x9 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb +; CHECK-NEXT: v_mov_b32_e32 v0, 7 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v4, s2 +; CHECK-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-NEXT: s_mov_b32 m0, -1 +; CHECK-NEXT: ds_cmpst_b64 v4, v[0:1], v[2:3] offset:32 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_endpgm +; +; GFX7-LABEL: lds_atomic_cmpxchg_noret_i64_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb +; GFX7-NEXT: v_mov_b32_e32 v0, 7 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: ds_cmpst_b64 v4, v[0:1], v[2:3] offset:32 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: lds_atomic_cmpxchg_noret_i64_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX8-NEXT: v_mov_b32_e32 v0, 7 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: ds_cmpst_b64 v4, v[0:1], v[2:3] offset:32 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: lds_atomic_cmpxchg_noret_i64_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX9-NEXT: v_mov_b32_e32 v0, 7 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: ds_cmpst_b64 v4, v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: lds_atomic_cmpxchg_noret_i64_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX11-NEXT: v_mov_b32_e32 v0, 7 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v4, s2 +; GFX11-NEXT: v_mov_b32_e32 v3, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: ds_cmpstore_b64 v4, v[2:3], v[0:1] offset:32 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: s_endpgm %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4 %pair = cmpxchg ptr addrspace(3) %gep, i64 7, i64 %swap seq_cst monotonic %result = extractvalue { i64, i1 } %pair, 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll index a9a64318cee0f..18afb7c677207 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s ; XUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s @@ -7,123 +8,210 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone declare float @llvm.amdgcn.div.fmas.f32(float, float, float, i1) nounwind readnone declare double @llvm.amdgcn.div.fmas.f64(double, double, double, i1) nounwind readnone -; GCN-LABEL: {{^}}test_div_fmas_f32: -; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13 -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c -; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x25 - -; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c -; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x70 -; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x94 - -; GCN-DAG: s_bitcmp1_b32 s{{[0-9]+}}, 0 - -; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]] -; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; GCN-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]] -; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], [[VB]], [[VC]] -; GCN: buffer_store_dword [[RESULT]], define amdgpu_kernel void @test_div_fmas_f32(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) nounwind { +; GCN-LABEL: test_div_fmas_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-NEXT: s_load_dword s6, s[4:5], 0x13 +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2e +; GCN-NEXT: s_load_dword s7, s[4:5], 0x1c +; GCN-NEXT: s_load_dword s4, s[4:5], 0x25 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_bitcmp1_b32 s2, 0 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_div_fmas_f32 v0, v0, v1, v2 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d) nounwind readnone store float %result, ptr addrspace(1) %out, align 4 ret void } -; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_0: -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c -; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x25 -; SI-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]] -; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], 1.0, [[VB]], [[VC]] -; SI: buffer_store_dword [[RESULT]], define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) nounwind { +; GCN-LABEL: test_div_fmas_f32_inline_imm_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2e +; GCN-NEXT: s_load_dword s6, s[4:5], 0x1c +; GCN-NEXT: s_load_dword s4, s[4:5], 0x25 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_bitcmp1_b32 s2, 0 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: v_div_fmas_f32 v0, 1.0, v0, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float 1.0, float %b, float %c, i1 %d) nounwind readnone store float %result, ptr addrspace(1) %out, align 4 ret void } -; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_1: -; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd - -; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c -; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x94 - -; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]] -; GCN-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]] -; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], 1.0, [[VC]] -; GCN: buffer_store_dword [[RESULT]], define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(ptr addrspace(1) %out, float %a, float %b, float %c, [8 x i32], i1 %d) nounwind { +; GCN-LABEL: test_div_fmas_f32_inline_imm_1: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-NEXT: s_load_dword s2, s[4:5], 0x16 +; GCN-NEXT: s_load_dword s6, s[4:5], 0xb +; GCN-NEXT: s_load_dword s4, s[4:5], 0xd +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_bitcmp1_b32 s2, 0 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: v_div_fmas_f32 v0, v0, 1.0, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float 1.0, float %c, i1 %d) nounwind readnone store float %result, ptr addrspace(1) %out, align 4 ret void } -; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_2: -; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13 -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c - -; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c -; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x70 - -; GCN-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]] -; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], [[VB]], 1.0 -; GCN: buffer_store_dword [[RESULT]], define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) nounwind { +; GCN-LABEL: test_div_fmas_f32_inline_imm_2: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2e +; GCN-NEXT: s_load_dword s6, s[4:5], 0x13 +; GCN-NEXT: s_load_dword s4, s[4:5], 0x1c +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_bitcmp1_b32 s2, 0 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: v_div_fmas_f32 v0, v0, v1, 1.0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float 1.0, i1 %d) nounwind readnone store float %result, ptr addrspace(1) %out, align 4 ret void } -; GCN-LABEL: {{^}}test_div_fmas_f64: -; GCN: v_div_fmas_f64 define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, double %b, double %c, i1 %d) nounwind { +; GCN-LABEL: test_div_fmas_f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s8, s[4:5], 0x11 +; GCN-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_bitcmp1_b32 s8, 0 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_mov_b32 s10, -1 +; GCN-NEXT: s_mov_b32 s8, s0 +; GCN-NEXT: s_mov_b32 s9, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NEXT: v_mov_b32_e32 v4, s6 +; GCN-NEXT: v_mov_b32_e32 v5, s7 +; GCN-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GCN-NEXT: s_endpgm %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d) nounwind readnone store double %result, ptr addrspace(1) %out, align 8 ret void } -; GCN-LABEL: {{^}}test_div_fmas_f32_cond_to_vcc: -; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0{{$}} -; GCN: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(ptr addrspace(1) %out, float %a, float %b, float %c, i32 %i) nounwind { +; GCN-LABEL: test_div_fmas_f32_cond_to_vcc: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_eq_u32 s3, 0 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_div_fmas_f32 v0, v0, v1, v2 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_endpgm %cmp = icmp eq i32 %i, 0 %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cmp) nounwind readnone store float %result, ptr addrspace(1) %out, align 4 ret void } -; GCN-LABEL: {{^}}test_div_fmas_f32_imm_false_cond_to_vcc: -; GCN: s_mov_b64 vcc, 0 -; GCN: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(ptr addrspace(1) %out, float %a, float %b, float %c) nounwind { +; GCN-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: s_mov_b64 vcc, 0 +; GCN-NEXT: v_div_fmas_f32 v0, v0, v1, v2 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 false) nounwind readnone store float %result, ptr addrspace(1) %out, align 4 ret void } -; GCN-LABEL: {{^}}test_div_fmas_f32_imm_true_cond_to_vcc: -; GCN: s_mov_b64 vcc, -1 -; GCN: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(ptr addrspace(1) %out, float %a, float %b, float %c) nounwind { +; GCN-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: s_mov_b64 vcc, -1 +; GCN-NEXT: v_div_fmas_f32 v0, v0, v1, v2 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 true) nounwind readnone store float %result, ptr addrspace(1) %out, align 4 ret void } -; GCN-LABEL: {{^}}test_div_fmas_f32_logical_cond_to_vcc: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}} - -; SI-DAG: v_cmp_eq_u32_e32 [[CMP0:vcc]], 0, v{{[0-9]+}} -; SI-DAG: s_cmp_lg_u32 s{{[0-9]+}}, 0{{$}} -; SI-DAG: s_cselect_b64 [[CMP1:s\[[0-9]+:[0-9]+\]]], -1, 0 -; SI: s_and_b64 vcc, [[CMP0]], [[CMP1]] -; SI: v_div_fmas_f32 {{v[0-9]+}}, [[A]], [[B]], [[C]] -; SI: s_endpgm define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %d) nounwind { +; GCN-LABEL: test_div_fmas_f32_logical_cond_to_vcc: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_load_dword s8, s[4:5], 0xd +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dword v0, v[1:2], s[4:7], 0 addr64 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dword v3, v[1:2], s[4:7], 0 addr64 offset:4 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dword v1, v[1:2], s[4:7], 0 addr64 offset:8 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_cmp_lg_u32 s8, 0 +; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN-NEXT: s_and_b64 vcc, vcc, s[2:3] +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s3, s7 +; GCN-NEXT: v_div_fmas_f32 v0, v0, v3, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.a = getelementptr float, ptr addrspace(1) %in, i32 %tid %gep.b = getelementptr float, ptr addrspace(1) %gep.a, i32 1 @@ -143,26 +231,39 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(ptr addrspace(1 ret void } -; GCN-LABEL: {{^}}test_div_fmas_f32_i1_phi_vcc: - -; SI: ; %entry -; SI: v_cmp_eq_u32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0, {{v[0-9]+}} -; SI: s_mov_b64 vcc, 0 -; SI: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[CMP]] - -; SI: ; %bb -; SI: buffer_load_dword [[LOAD:v[0-9]+]], -; SI: v_cmp_ne_u32_e32 vcc, 0, [[LOAD]] -; SI: s_and_b64 vcc, vcc, exec - -; SI: ; %exit -; SI: s_or_b64 exec, exec, [[SAVE]] -; SI-NOT: vcc -; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -; SI: buffer_store_dword -; SI: s_endpgm - define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %dummy) nounwind { +; GCN-LABEL: test_div_fmas_f32_i1_phi_vcc: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; GCN-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[0:1], s[10:11] +; GCN-NEXT: buffer_load_dwordx2 v[1:2], v[3:4], s[0:3], 0 addr64 +; GCN-NEXT: buffer_load_dword v3, v[3:4], s[0:3], 0 addr64 offset:8 +; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v0 +; GCN-NEXT: s_mov_b64 vcc, 0 +; GCN-NEXT: s_and_saveexec_b64 s[10:11], s[0:1] +; GCN-NEXT: s_cbranch_execz .LBB9_2 +; GCN-NEXT: ; %bb.1: ; %bb +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s7, s3 +; GCN-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_and_b64 vcc, vcc, exec +; GCN-NEXT: .LBB9_2: ; %exit +; GCN-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_div_fmas_f32 v0, v1, v2, v3 +; GCN-NEXT: s_mov_b32 s10, -1 +; GCN-NEXT: s_mov_b32 s11, s3 +; GCN-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:8 +; GCN-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.out = getelementptr float, ptr addrspace(1) %out, i32 2 @@ -188,3 +289,5 @@ exit: store float %result, ptr addrspace(1) %gep.out, align 4 ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; SI: {{.*}}