diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll index 787a15b6dc7275..efdae50e1f127e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll @@ -1,13 +1,19 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=CHECK %s ; Check that WQM is not triggered by the softwqm intrinsic alone. ; -;CHECK-LABEL: {{^}}test1: -;CHECK-NOT: s_wqm_b64 exec, exec -;CHECK: buffer_load_dword -;CHECK: buffer_load_dword -;CHECK: v_add_f32_e32 define amdgpu_ps float @test1(i32 inreg %idx0, i32 inreg %idx1) { +; CHECK-LABEL: test1: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_add_f32_e32 v0, v0, v1 +; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec +; CHECK-NEXT: ; return to shader part epilog main_body: %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) @@ -18,12 +24,17 @@ main_body: ; Check that the softwqm intrinsic works correctly for integers. ; -;CHECK-LABEL: {{^}}test2: -;CHECK-NOT: s_wqm_b64 exec, exec -;CHECK: buffer_load_dword -;CHECK: buffer_load_dword -;CHECK: v_add_f32_e32 define amdgpu_ps float @test2(i32 inreg %idx0, i32 inreg %idx1) { +; CHECK-LABEL: test2: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_add_f32_e32 v0, v0, v1 +; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec +; CHECK-NEXT: ; return to shader part epilog main_body: %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) @@ -36,14 +47,20 @@ main_body: ; Make sure the transition from WQM to Exact to softwqm does not trigger WQM. ; -;CHECK-LABEL: {{^}}test_softwqm1: -;CHECK-NOT: s_wqm_b64 exec, exec -;CHECK: buffer_load_dword -;CHECK: buffer_load_dword -;CHECK: buffer_store_dword -;CHECK-NOT; s_wqm_b64 exec, exec -;CHECK: v_add_f32_e32 define amdgpu_ps float @test_softwqm1(i32 inreg %idx0, i32 inreg %idx1) { +; CHECK-LABEL: test_softwqm1: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-NEXT: v_mov_b32_e32 v2, s1 +; CHECK-NEXT: buffer_load_dword v1, v0, s[0:3], 0 idxen +; CHECK-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_add_f32_e32 v1, v1, v2 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen +; CHECK-NEXT: v_add_f32_e32 v0, v1, v1 +; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ; return to shader part epilog main_body: %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) @@ -56,16 +73,26 @@ main_body: ; Make sure the transition from WQM to Exact to softwqm does trigger WQM. ; -;CHECK-LABEL: {{^}}test_softwqm2: -;CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec -;CHECK: s_wqm_b64 exec, exec -;CHECK: buffer_load_dword -;CHECK: buffer_load_dword -;CHECK: v_add_f32_e32 -;CHECK: v_add_f32_e32 -;CHECK: s_and_b64 exec, exec, [[ORIG]] -;CHECK: buffer_store_dword define amdgpu_ps float @test_softwqm2(i32 inreg %idx0, i32 inreg %idx1) { +; CHECK-LABEL: test_softwqm2: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: s_mov_b64 s[2:3], exec +; CHECK-NEXT: s_wqm_b64 exec, exec +; CHECK-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-NEXT: v_mov_b32_e32 v2, s1 +; CHECK-NEXT: buffer_load_dword v1, v0, s[0:3], 0 idxen +; CHECK-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_add_f32_e32 v1, v1, v2 +; CHECK-NEXT: v_mov_b32_e32 v2, v1 +; CHECK-NEXT: v_add_f32_e32 v1, v1, v1 +; CHECK-NEXT: s_and_b64 exec, exec, s[2:3] +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen +; CHECK-NEXT: s_wqm_b64 exec, exec +; CHECK-NEXT: v_mov_b32_e32 v0, v1 +; CHECK-NEXT: s_and_b64 exec, exec, s[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ; return to shader part epilog main_body: %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) @@ -80,17 +107,25 @@ main_body: ; NOTE: llvm.amdgcn.wwm is deprecated, use llvm.amdgcn.strict.wwm instead. ; Make sure the transition from Exact to STRICT_WWM then softwqm does not trigger WQM. ; -;CHECK-LABEL: {{^}}test_wwm1: -;CHECK: s_or_saveexec_b64 [[ORIG0:s\[[0-9]+:[0-9]+\]]], -1 -;CHECK: buffer_load_dword -;CHECK: s_mov_b64 exec, [[ORIG0]] -;CHECK: buffer_store_dword -;CHECK: s_or_saveexec_b64 [[ORIG1:s\[[0-9]+:[0-9]+\]]], -1 -;CHECK: buffer_load_dword -;CHECK: v_add_f32_e32 -;CHECK: s_mov_b64 exec, [[ORIG1]] -;CHECK-NOT: s_wqm_b64 define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) { +; CHECK-LABEL: test_wwm1: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: s_or_saveexec_b64 s[2:3], -1 +; CHECK-NEXT: v_mov_b32_e32 v1, s0 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen +; CHECK-NEXT: s_mov_b64 exec, s[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_store_dword v2, v1, s[0:3], 0 idxen +; CHECK-NEXT: s_or_saveexec_b64 s[2:3], -1 +; CHECK-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_add_f32_e32 v1, v2, v1 +; CHECK-NEXT: s_mov_b64 exec, s[2:3] +; CHECK-NEXT: v_mov_b32_e32 v0, v1 +; CHECK-NEXT: v_add_f32_e32 v0, v0, v0 +; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec +; CHECK-NEXT: ; return to shader part epilog main_body: %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) @@ -104,17 +139,25 @@ main_body: ; Make sure the transition from Exact to STRICT_WWM then softwqm does not trigger WQM. ; -;CHECK-LABEL: {{^}}test_strict_wwm1: -;CHECK: s_or_saveexec_b64 [[ORIG0:s\[[0-9]+:[0-9]+\]]], -1 -;CHECK: buffer_load_dword -;CHECK: s_mov_b64 exec, [[ORIG0]] -;CHECK: buffer_store_dword -;CHECK: s_or_saveexec_b64 [[ORIG1:s\[[0-9]+:[0-9]+\]]], -1 -;CHECK: buffer_load_dword -;CHECK: v_add_f32_e32 -;CHECK: s_mov_b64 exec, [[ORIG1]] -;CHECK-NOT: s_wqm_b64 define amdgpu_ps float @test_strict_wwm1(i32 inreg %idx0, i32 inreg %idx1) { +; CHECK-LABEL: test_strict_wwm1: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: s_or_saveexec_b64 s[2:3], -1 +; CHECK-NEXT: v_mov_b32_e32 v1, s0 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen +; CHECK-NEXT: s_mov_b64 exec, s[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_store_dword v2, v1, s[0:3], 0 idxen +; CHECK-NEXT: s_or_saveexec_b64 s[2:3], -1 +; CHECK-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_add_f32_e32 v1, v2, v1 +; CHECK-NEXT: s_mov_b64 exec, s[2:3] +; CHECK-NEXT: v_mov_b32_e32 v0, v1 +; CHECK-NEXT: v_add_f32_e32 v0, v0, v0 +; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec +; CHECK-NEXT: ; return to shader part epilog main_body: %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) @@ -129,15 +172,32 @@ main_body: ; Check that softwqm on one case of branch does not trigger WQM for shader. ; -;CHECK-LABEL: {{^}}test_control_flow_0: -;CHECK-NEXT: ; %main_body -;CHECK-NOT: s_wqm_b64 exec, exec -;CHECK: %ELSE -;CHECK: store -;CHECK: %IF -;CHECK: buffer_load -;CHECK: buffer_load define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 inreg %idx0, i32 inreg %idx1, i32 %c, i32 %z, float %data) { +; CHECK-LABEL: test_control_flow_0: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; CHECK-NEXT: s_and_saveexec_b64 s[0:1], vcc +; CHECK-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; CHECK-NEXT: s_cbranch_execz .LBB6_2 +; CHECK-NEXT: ; %bb.1: ; %ELSE +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen +; CHECK-NEXT: .LBB6_2: ; %Flow +; CHECK-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] +; CHECK-NEXT: s_xor_b64 exec, exec, s[0:1] +; CHECK-NEXT: s_cbranch_execz .LBB6_4 +; CHECK-NEXT: ; %bb.3: ; %IF +; CHECK-NEXT: v_mov_b32_e32 v0, s12 +; CHECK-NEXT: v_mov_b32_e32 v1, s13 +; CHECK-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_add_f32_e32 v2, v0, v1 +; CHECK-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec killed $exec +; CHECK-NEXT: .LBB6_4: ; %END +; CHECK-NEXT: s_or_b64 exec, exec, s[0:1] +; CHECK-NEXT: v_mov_b32_e32 v0, v2 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ; return to shader part epilog main_body: %cmp = icmp eq i32 %z, 0 br i1 %cmp, label %IF, label %ELSE @@ -160,20 +220,41 @@ END: ; Check that softwqm on one case of branch is treated as WQM in WQM shader. ; -;CHECK-LABEL: {{^}}test_control_flow_1: -;CHECK-NEXT: ; %main_body -;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec -;CHECK-NEXT: s_wqm_b64 exec, exec -;CHECK: %ELSE -;CHECK: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[ORIG]] -;CHECK: store -;CHECK: s_mov_b64 exec, [[SAVED]] -;CHECK: %IF -;CHECK-NOT: s_and_saveexec_b64 -;CHECK-NOT: s_and_b64 exec -;CHECK: buffer_load -;CHECK: buffer_load define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 inreg %idx0, i32 inreg %idx1, i32 %c, i32 %z, float %data) { +; CHECK-LABEL: test_control_flow_1: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: s_mov_b64 s[14:15], exec +; CHECK-NEXT: s_wqm_b64 exec, exec +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; CHECK-NEXT: s_and_saveexec_b64 s[16:17], vcc +; CHECK-NEXT: s_xor_b64 s[16:17], exec, s[16:17] +; CHECK-NEXT: s_cbranch_execz .LBB7_2 +; CHECK-NEXT: ; %bb.1: ; %ELSE +; CHECK-NEXT: image_sample v1, v0, s[0:7], s[8:11] dmask:0x1 +; CHECK-NEXT: s_and_saveexec_b64 s[18:19], s[14:15] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: image_sample v1, v1, s[0:7], s[8:11] dmask:0x1 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen +; CHECK-NEXT: s_mov_b64 exec, s[18:19] +; CHECK-NEXT: .LBB7_2: ; %Flow +; CHECK-NEXT: s_or_saveexec_b64 s[0:1], s[16:17] +; CHECK-NEXT: s_xor_b64 exec, exec, s[0:1] +; CHECK-NEXT: s_cbranch_execz .LBB7_4 +; CHECK-NEXT: ; %bb.3: ; %IF +; CHECK-NEXT: v_mov_b32_e32 v0, s12 +; CHECK-NEXT: v_mov_b32_e32 v1, s13 +; CHECK-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_add_f32_e32 v2, v0, v1 +; CHECK-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec killed $exec +; CHECK-NEXT: .LBB7_4: ; %END +; CHECK-NEXT: s_or_b64 exec, exec, s[0:1] +; CHECK-NEXT: s_and_b64 exec, exec, s[14:15] +; CHECK-NEXT: v_mov_b32_e32 v0, v2 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ; return to shader part epilog main_body: %c.bc = bitcast i32 %c to float %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 @@ -205,7 +286,6 @@ declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32 immarg) #3 declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3 declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3 -declare void @llvm.amdgcn.kill(i1) #1 declare float @llvm.amdgcn.wqm.f32(float) #3 declare float @llvm.amdgcn.softwqm.f32(float) #3 declare i32 @llvm.amdgcn.softwqm.i32(i32) #3