diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index d1252f4154713..bef9a27868143 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -312,6 +312,7 @@ def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; +def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll index 46cb8cc1312dc..5fce455e114cb 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll @@ -839,5 +839,12 @@ define amdgpu_cs void @call_whole_wave(ptr addrspace(1) %out) { declare amdgpu_gfx_whole_wave i32 @wwf(i1, i32) #0 +; CHECK: DIVERGENT: %v = call i1 @llvm.amdgcn.wqm.vote(i1 %c) +define amdgpu_kernel void @wqm_vote(i32 %a, i32 %b) #1 { + %c = icmp eq i32 %a, %b + %v = call i1 @llvm.amdgcn.wqm.vote(i1 %c) #1 + ret void +} + attributes #0 = { nounwind convergent } attributes #1 = { nounwind readnone convergent } diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll index e7ade6614795c..ce270ec2f24ac 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll @@ -41,9 +41,8 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa ; GFX7-NEXT: .LBB0_4: ; %Flow ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_wqm_b64 s[4:5], -1 -; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], s[4:5] -; GFX7-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; GFX7-NEXT: s_cbranch_vccnz .LBB0_6 +; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB0_6 ; GFX7-NEXT: ; %bb.5: ; %if ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: .LBB0_6: ; %UnifiedReturnBlock @@ -76,9 +75,8 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa ; GFX89-NEXT: .LBB0_4: ; %Flow ; GFX89-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX89-NEXT: s_wqm_b64 s[4:5], -1 -; GFX89-NEXT: s_and_b64 s[4:5], s[4:5], s[4:5] -; GFX89-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; GFX89-NEXT: s_cbranch_vccnz .LBB0_6 +; GFX89-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GFX89-NEXT: s_cbranch_execz .LBB0_6 ; GFX89-NEXT: ; %bb.5: ; %if ; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX89-NEXT: .LBB0_6: ; %UnifiedReturnBlock @@ -112,9 +110,8 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa ; GFX1064-NEXT: .LBB0_4: ; %Flow ; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1064-NEXT: s_wqm_b64 s[4:5], -1 -; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], s[4:5] -; GFX1064-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; GFX1064-NEXT: s_cbranch_vccnz .LBB0_6 +; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GFX1064-NEXT: s_cbranch_execz .LBB0_6 ; GFX1064-NEXT: ; %bb.5: ; %if ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: .LBB0_6: ; %UnifiedReturnBlock @@ -147,9 +144,8 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa ; GFX1032-NEXT: .LBB0_4: ; %Flow ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX1032-NEXT: s_wqm_b32 s4, -1 -; GFX1032-NEXT: s_and_b32 s4, s4, s4 -; GFX1032-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4 -; GFX1032-NEXT: s_cbranch_vccnz .LBB0_6 +; GFX1032-NEXT: s_and_saveexec_b32 s5, s4 +; GFX1032-NEXT: s_cbranch_execz .LBB0_6 ; GFX1032-NEXT: ; %bb.5: ; %if ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: .LBB0_6: ; %UnifiedReturnBlock @@ -186,10 +182,9 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa ; GFX1164-NEXT: .LBB0_4: ; %Flow ; GFX1164-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1164-NEXT: s_wqm_b64 s[4:5], -1 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], s[4:5] -; GFX1164-NEXT: s_and_not1_b64 vcc, exec, s[4:5] -; GFX1164-NEXT: s_cbranch_vccnz .LBB0_6 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GFX1164-NEXT: s_cbranch_execz .LBB0_6 ; GFX1164-NEXT: ; %bb.5: ; %if ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-NEXT: .LBB0_6: ; %UnifiedReturnBlock @@ -225,10 +220,9 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa ; GFX1132-NEXT: .LBB0_4: ; %Flow ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX1132-NEXT: s_wqm_b32 s4, -1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_b32 s4, s4, s4 -; GFX1132-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 -; GFX1132-NEXT: s_cbranch_vccnz .LBB0_6 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_saveexec_b32 s5, s4 +; GFX1132-NEXT: s_cbranch_execz .LBB0_6 ; GFX1132-NEXT: ; %bb.5: ; %if ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: .LBB0_6: ; %UnifiedReturnBlock @@ -250,14 +244,14 @@ else: define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspace(8) inreg %inout, i32 %val) { ; GFX7-LABEL: add_i32_varying: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_wqm_b64 s[8:9], -1 ; GFX7-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc -; GFX7-NEXT: s_andn2_b64 vcc, exec, s[8:9] -; GFX7-NEXT: s_cbranch_vccnz .LBB1_2 +; GFX7-NEXT: s_wqm_b64 s[4:5], -1 +; GFX7-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GFX7-NEXT: s_cbranch_execz .LBB1_2 ; GFX7-NEXT: ; %bb.1: ; %if ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX7-NEXT: .LBB1_2: ; %else +; GFX7-NEXT: .LBB1_2: ; %UnifiedReturnBlock ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: add_i32_varying: @@ -307,9 +301,8 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX8-NEXT: .LBB1_4: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_wqm_b64 s[4:5], -1 -; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], s[4:5] -; GFX8-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; GFX8-NEXT: s_cbranch_vccnz .LBB1_6 +; GFX8-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GFX8-NEXT: s_cbranch_execz .LBB1_6 ; GFX8-NEXT: ; %bb.5: ; %if ; GFX8-NEXT: buffer_store_dword v3, off, s[0:3], 0 ; GFX8-NEXT: .LBB1_6: ; %UnifiedReturnBlock @@ -362,9 +355,8 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX9-NEXT: .LBB1_4: ; %Flow ; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX9-NEXT: s_wqm_b64 s[4:5], -1 -; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], s[4:5] -; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; GFX9-NEXT: s_cbranch_vccnz .LBB1_6 +; GFX9-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB1_6 ; GFX9-NEXT: ; %bb.5: ; %if ; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], 0 ; GFX9-NEXT: .LBB1_6: ; %UnifiedReturnBlock @@ -422,9 +414,8 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX1064-NEXT: .LBB1_4: ; %Flow ; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1064-NEXT: s_wqm_b64 s[4:5], -1 -; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], s[4:5] -; GFX1064-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; GFX1064-NEXT: s_cbranch_vccnz .LBB1_6 +; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GFX1064-NEXT: s_cbranch_execz .LBB1_6 ; GFX1064-NEXT: ; %bb.5: ; %if ; GFX1064-NEXT: buffer_store_dword v4, off, s[0:3], 0 ; GFX1064-NEXT: .LBB1_6: ; %UnifiedReturnBlock @@ -472,9 +463,8 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX1032-NEXT: .LBB1_4: ; %Flow ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX1032-NEXT: s_wqm_b32 s4, -1 -; GFX1032-NEXT: s_and_b32 s4, s4, s4 -; GFX1032-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4 -; GFX1032-NEXT: s_cbranch_vccnz .LBB1_6 +; GFX1032-NEXT: s_and_saveexec_b32 s5, s4 +; GFX1032-NEXT: s_cbranch_execz .LBB1_6 ; GFX1032-NEXT: ; %bb.5: ; %if ; GFX1032-NEXT: buffer_store_dword v4, off, s[0:3], 0 ; GFX1032-NEXT: .LBB1_6: ; %UnifiedReturnBlock @@ -542,10 +532,9 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX1164-NEXT: .LBB1_4: ; %Flow ; GFX1164-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1164-NEXT: s_wqm_b64 s[4:5], -1 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], s[4:5] -; GFX1164-NEXT: s_and_not1_b64 vcc, exec, s[4:5] -; GFX1164-NEXT: s_cbranch_vccnz .LBB1_6 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GFX1164-NEXT: s_cbranch_execz .LBB1_6 ; GFX1164-NEXT: ; %bb.5: ; %if ; GFX1164-NEXT: buffer_store_b32 v4, off, s[0:3], 0 ; GFX1164-NEXT: .LBB1_6: ; %UnifiedReturnBlock @@ -599,10 +588,9 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX1132-NEXT: .LBB1_4: ; %Flow ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX1132-NEXT: s_wqm_b32 s4, -1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_b32 s4, s4, s4 -; GFX1132-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 -; GFX1132-NEXT: s_cbranch_vccnz .LBB1_6 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_saveexec_b32 s5, s4 +; GFX1132-NEXT: s_cbranch_execz .LBB1_6 ; GFX1132-NEXT: ; %bb.5: ; %if ; GFX1132-NEXT: buffer_store_b32 v4, off, s[0:3], 0 ; GFX1132-NEXT: .LBB1_6: ; %UnifiedReturnBlock diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.vote.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.vote.ll index f437cd2152f13..933a9678e2cee 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.vote.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.vote.ll @@ -1,16 +1,33 @@ -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=CHECK,WAVE64 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=CHECK,WAVE64 %s -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=CHECK,WAVE32 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=CHECK,WAVE32 %s -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=CHECK,WAVE32 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=CHECK,WAVE32 %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=WAVE64 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=WAVE64 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s -;CHECK-LABEL: {{^}}ret: -;CHECK: v_cmp_eq_u32_e32 [[CMP:[^,]+]], v0, v1 -;WAVE64: s_wqm_b64 [[WQM:[^,]+]], [[CMP]] -;WAVE32: s_wqm_b32 [[WQM:[^,]+]], [[CMP]] -;CHECK: v_cndmask_b32_e64 v0, 0, 1.0, [[WQM]] define amdgpu_ps float @ret(i32 %v0, i32 %v1) #1 { +; WAVE64-LABEL: ret: +; WAVE64: ; %bb.0: ; %main_body +; WAVE64-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; WAVE64-NEXT: s_wqm_b64 s[0:1], vcc +; WAVE64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s[0:1] +; WAVE64-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: ret: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: s_wqm_b32 s0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: ret: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-NEXT: s_wqm_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0 +; GFX11-NEXT: ; return to shader part epilog main_body: %c = icmp eq i32 %v0, %v1 %w = call i1 @llvm.amdgcn.wqm.vote(i1 %c) @@ -18,20 +35,50 @@ main_body: ret float %r } -;CHECK-LABEL: {{^}}true: -;WAVE64: s_wqm_b64 -;WAVE32: s_wqm_b32 define amdgpu_ps float @true() #1 { +; WAVE64-LABEL: true: +; WAVE64: ; %bb.0: ; %main_body +; WAVE64-NEXT: s_wqm_b64 s[0:1], -1 +; WAVE64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s[0:1] +; WAVE64-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: true: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_wqm_b32 s0, -1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: true: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: s_wqm_b32 s0, -1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0 +; GFX11-NEXT: ; return to shader part epilog main_body: %w = call i1 @llvm.amdgcn.wqm.vote(i1 true) %r = select i1 %w, float 1.0, float 0.0 ret float %r } -;CHECK-LABEL: {{^}}false: -;WAVE64: s_wqm_b64 -;WAVE32: s_wqm_b32 define amdgpu_ps float @false() #1 { +; WAVE64-LABEL: false: +; WAVE64: ; %bb.0: ; %main_body +; WAVE64-NEXT: s_wqm_b64 s[0:1], 0 +; WAVE64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s[0:1] +; WAVE64-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: false: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_wqm_b32 s0, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: false: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: s_wqm_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0 +; GFX11-NEXT: ; return to shader part epilog main_body: %w = call i1 @llvm.amdgcn.wqm.vote(i1 false) %r = select i1 %w, float 1.0, float 0.0 @@ -39,21 +86,61 @@ main_body: } ; Note: an almost identical test for this exists in llvm.amdgcn.kill.ll -;CHECK-LABEL: {{^}}kill: -;CHECK: v_cmp_eq_u32_e32 [[CMP:[^,]+]], v0, v1 - -;WAVE64: s_wqm_b64 [[WQM:[^,]+]], [[CMP]] -;WAVE64: s_andn2_b64 [[KILL:[^,]+]], exec, [[WQM]] -;WAVE64: s_andn2_b64 [[MASK:[^,]+]], [[EXEC:[^,]+]], [[KILL]] -;WAVE64: s_and_b64 exec, exec, [[MASK]] - -;WAVE32: s_wqm_b32 [[WQM:[^,]+]], [[CMP]] -;WAVE32: s_and{{n2|_not1}}_b32 [[KILL:[^,]+]], exec_lo, [[WQM]] -;WAVE32: s_and{{n2|_not1}}_b32 [[MASK:[^,]+]], [[EXEC:[^,]+]], [[KILL]] -;WAVE32: s_and_b32 exec_lo, exec_lo, [[MASK]] - -;CHECK: s_endpgm define amdgpu_ps float @kill(i32 %v0, i32 %v1) #1 { +; WAVE64-LABEL: kill: +; WAVE64: ; %bb.0: ; %main_body +; WAVE64-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; WAVE64-NEXT: s_wqm_b64 s[2:3], vcc +; WAVE64-NEXT: s_mov_b64 s[0:1], exec +; WAVE64-NEXT: s_andn2_b64 s[2:3], exec, s[2:3] +; WAVE64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; WAVE64-NEXT: s_cbranch_scc0 .LBB3_2 +; WAVE64-NEXT: ; %bb.1: ; %main_body +; WAVE64-NEXT: s_and_b64 exec, exec, s[0:1] +; WAVE64-NEXT: v_mov_b32_e32 v0, 0 +; WAVE64-NEXT: s_branch .LBB3_3 +; WAVE64-NEXT: .LBB3_2: +; WAVE64-NEXT: s_mov_b64 exec, 0 +; WAVE64-NEXT: exp null, off, off, off, off done vm +; WAVE64-NEXT: s_endpgm +; WAVE64-NEXT: .LBB3_3: +; +; GFX10-LABEL: kill: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: s_mov_b32 s0, exec_lo +; GFX10-NEXT: s_wqm_b32 s1, vcc_lo +; GFX10-NEXT: s_andn2_b32 s1, exec_lo, s1 +; GFX10-NEXT: s_andn2_b32 s0, s0, s1 +; GFX10-NEXT: s_cbranch_scc0 .LBB3_2 +; GFX10-NEXT: ; %bb.1: ; %main_body +; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_branch .LBB3_3 +; GFX10-NEXT: .LBB3_2: +; GFX10-NEXT: s_mov_b32 exec_lo, 0 +; GFX10-NEXT: exp null, off, off, off, off done vm +; GFX10-NEXT: s_endpgm +; GFX10-NEXT: .LBB3_3: +; +; GFX11-LABEL: kill: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_wqm_b32 s1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, exec_lo, s1 +; GFX11-NEXT: s_and_not1_b32 s0, s0, s1 +; GFX11-NEXT: s_cbranch_scc0 .LBB3_2 +; GFX11-NEXT: ; %bb.1: ; %main_body +; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_branch .LBB3_3 +; GFX11-NEXT: .LBB3_2: +; GFX11-NEXT: s_mov_b32 exec_lo, 0 +; GFX11-NEXT: exp mrt0, off, off, off, off done +; GFX11-NEXT: s_endpgm +; GFX11-NEXT: .LBB3_3: main_body: %c = icmp eq i32 %v0, %v1 %w = call i1 @llvm.amdgcn.wqm.vote(i1 %c)