[AMDGPU] exp should not be in WQM mode

A mrt exp with vm=1 must be in exact (non-WQM) mode, as it also exports the exec mask as the valid mask to determine which pixels to render. This commit marks any exp as needing to be in exact mode. Actually, if there are multiple mrt exps, only one needs to have vm=1, and only that one needs to be in exact mode. But that is an optimization for another day. Differential Revision: https://reviews.llvm.org/D36305 llvm-svn: 312915
llvm · Sep 11, 2017 · 660ba2b · 660ba2b
1 parent 8a1c2b4
commit 660ba2b
Show file tree

Hide file tree

Showing 3 changed files with 59 additions and 15 deletions.
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -849,7 +849,7 @@ class EXP_Helper<bit done, SDPatternOperator node = null_frag> : EXPCommon<
 // Split EXP instruction into EXP and EXP_DONE so we can set
 // mayLoad for done=1.
 multiclass EXP_m<bit done, SDPatternOperator node> {
-  let mayLoad = done in {
+  let mayLoad = done, DisableWQM = 1 in {
     let isPseudo = 1, isCodeGenOnly = 1 in {
       def "" : EXP_Helper<done, node>,
                SIMCInstr <"exp"#!if(done, "_done", ""), SIEncodingFamily.NONE>;

diff --git a/llvm/test/CodeGen/AMDGPU/spill-m0.ll b/llvm/test/CodeGen/AMDGPU/spill-m0.ll
@@ -66,21 +66,21 @@ endif:
 
 ; TOSMEM-NOT: s_m0
 ; TOSMEM: s_add_u32 m0, s7, 0x100
-; TOSMEM-NEXT: s_buffer_store_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Spill
+; TOSMEM-NEXT: s_buffer_store_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Spill
 ; FIXME: RegScavenger::isRegUsed() always returns true if m0 is reserved, so we have to save and restore it
 ; FIXME-TOSMEM-NOT: m0
 
 ; FIXME-TOSMEM-NOT: m0
-; TOSMEM: s_add_u32 m0, s7, 0x200
-; TOSMEM: s_buffer_store_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Spill
+; TOSMEM: s_add_u32 m0, s7, 0x300
+; TOSMEM: s_buffer_store_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Spill
 ; FIXME-TOSMEM-NOT: m0
 
 ; TOSMEM: s_mov_b64 exec,
 ; TOSMEM: s_cbranch_execz
 ; TOSMEM: s_branch
 
 ; TOSMEM: BB{{[0-9]+_[0-9]+}}:
-; TOSMEM: s_add_u32 m0, s7, 0x200
+; TOSMEM: s_add_u32 m0, s7, 0x400
 ; TOSMEM-NEXT: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Reload
 
 

diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -12,24 +12,33 @@ main_body:
   ret <4 x float> %tex
 }
 
-; Check that WQM is triggered by image samples and left untouched for loads...
+; Check that WQM is triggered by code calculating inputs to image samples and is disabled as soon as possible
 ;
 ;CHECK-LABEL: {{^}}test2:
 ;CHECK-NEXT: ; %main_body
+;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
 ;CHECK-NEXT: s_wqm_b64 exec, exec
+;CHECK: interp
+;CHECK: s_and_b64 exec, exec, [[ORIG]]
+;CHECK-NOT: interp
+;CHECK: image_sample
 ;CHECK-NOT: exec
-define amdgpu_ps void @test2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x float> %c) {
+;CHECK: .size test2
+define amdgpu_ps <4 x float> @test2(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #6 {
 main_body:
-  %c.1 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
-  %c.2 = bitcast <4 x float> %c.1 to <4 x i32>
-  %c.3 = extractelement <4 x i32> %c.2, i32 0
-  %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c.3
-  %data = load float, float addrspace(1)* %gep
-  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %data, float undef, float undef, float undef, i1 true, i1 true) #1
-  ret void
+  %inst23 = extractelement <2 x float> %pos, i32 0
+  %inst24 = extractelement <2 x float> %pos, i32 1
+  %inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0)
+  %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0)
+  %inst27 = insertelement <2 x float> undef, float %inst26, i32 0
+  %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0)
+  %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0)
+  %inst30 = insertelement <2 x float> %inst27, float %inst29, i32 1
+  %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %inst30, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
+  ret <4 x float> %tex
 }
 
-; ... but disabled for stores (and, in this simple case, not re-enabled).
+; ... but disabled for stores (and, in this simple case, not re-enabled) ...
 ;
 ;CHECK-LABEL: {{^}}test3:
 ;CHECK-NEXT: ; %main_body
@@ -51,6 +60,36 @@ main_body:
   ret <4 x float> %tex
 }
 
+; ... and disabled for export.
+;
+;CHECK-LABEL: {{^}}test3x:
+;CHECK-NEXT: ; %main_body
+;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+;CHECK-NEXT: s_wqm_b64 exec, exec
+;CHECK: s_and_b64 exec, exec, [[ORIG]]
+;CHECK: image_sample
+;CHECK: exp
+;CHECK-NOT: exec
+;CHECK: .size test3x
+define amdgpu_ps void @test3x(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #6 {
+main_body:
+  %inst23 = extractelement <2 x float> %pos, i32 0
+  %inst24 = extractelement <2 x float> %pos, i32 1
+  %inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0)
+  %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0)
+  %inst27 = insertelement <2 x float> undef, float %inst26, i32 0
+  %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0)
+  %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0)
+  %inst30 = insertelement <2 x float> %inst27, float %inst29, i32 1
+  %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %inst30, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
+  %tex.0 = extractelement <4 x float> %tex, i32 0
+  %tex.1 = extractelement <4 x float> %tex, i32 1
+  %tex.2 = extractelement <4 x float> %tex, i32 2
+  %tex.3 = extractelement <4 x float> %tex, i32 3
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tex.0, float %tex.1, float %tex.2, float %tex.3, i1 true, i1 true)
+  ret void
+}
+
 ; Check that WQM is re-enabled when required.
 ;
 ;CHECK-LABEL: {{^}}test4:
@@ -724,9 +763,14 @@ declare i32 @llvm.amdgcn.wwm.i32(i32) #3
 declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #4
 declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #3
 declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #3
+declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #3
+declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #1
+declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #2
+declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #2
 
 attributes #1 = { nounwind }
 attributes #2 = { nounwind readonly }
 attributes #3 = { nounwind readnone }
 attributes #4 = { nounwind readnone convergent }
 attributes #5 = { "amdgpu-ps-wqm-outputs" }
+attributes #6 = { nounwind "InitialPSInputAddr"="2" }