Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[AMDGPU] Mark GFX11 dual source blend export as strict-wqm
The instructions that generate the sources of a dual source blend export should run in strict-wqm. That is, if any lane in a quad is active, all four lanes of that quad must be enabled so that the shuffle operations performed before exporting to the dual source blend targets work correctly. Differential Revision: https://reviews.llvm.org/D127981
- Loading branch information
Showing
3 changed files
with
124 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py | ||
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN | ||
|
||
; This IR is a slightly modified version of a real-world case, trimmed to keep the test concise. | ||
; Pixel-shader entry point (amdgpu_ps calling convention) exercising the GFX11 | ||
; dual source blend export path. It loads four channels of one attribute, | ||
; interpolates each of them, exchanges values between neighbouring lanes with | ||
; mov.dpp8, and exports the results to export targets 21 and 22. | ||
; | ||
; The assertions below expect the whole computation to be bracketed by | ||
; `s_wqm_b32 exec_lo, exec_lo` at entry and a restore of the saved exec mask | ||
; (`s_mov_b32 exec_lo, s1`) immediately before the two `exp` instructions. | ||
; | ||
; Arguments: | ||
;   %PrimMask     - inreg i32 passed as the last operand of every | ||
;                   lds.param.load call (copied to m0 in the expected output). | ||
;   %InterpCenter - <2 x float>; element 0 feeds the p10 interpolation step and | ||
;                   element 1 feeds the p2 step. | ||
define amdgpu_ps void @_amdgpu_ps_main(i32 inreg %PrimMask, <2 x float> %InterpCenter) #0 { | ||
; GCN-LABEL: _amdgpu_ps_main: | ||
; GCN: ; %bb.0: ; %.entry | ||
; GCN-NEXT: s_mov_b32 s1, exec_lo | ||
; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo | ||
; GCN-NEXT: s_mov_b32 m0, s0 | ||
; GCN-NEXT: v_mov_b32_e32 v2, v0 | ||
; GCN-NEXT: lds_param_load v3, attr1.x wait_vdst:15 | ||
; GCN-NEXT: lds_param_load v4, attr1.y wait_vdst:15 | ||
; GCN-NEXT: lds_param_load v5, attr1.z wait_vdst:15 | ||
; GCN-NEXT: lds_param_load v6, attr1.w wait_vdst:15 | ||
; GCN-NEXT: v_mbcnt_lo_u32_b32 v7, -1, 0 | ||
; GCN-NEXT: v_mbcnt_hi_u32_b32 v7, -1, v7 | ||
; GCN-NEXT: v_and_b32_e32 v7, 1, v7 | ||
; GCN-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 | ||
; GCN-NEXT: v_interp_p10_f32 v8, v4, v2, v4 wait_exp:2 | ||
; GCN-NEXT: v_interp_p10_f32 v10, v5, v2, v5 wait_exp:1 | ||
; GCN-NEXT: v_interp_p10_f32 v9, v6, v2, v6 | ||
; GCN-NEXT: v_interp_p10_f32 v2, v3, v2, v3 wait_exp:7 | ||
; GCN-NEXT: v_interp_p2_f32 v4, v4, v1, v8 wait_exp:7 | ||
; GCN-NEXT: v_interp_p2_f32 v5, v5, v1, v10 wait_exp:7 | ||
; GCN-NEXT: v_interp_p2_f32 v6, v6, v1, v9 wait_exp:7 | ||
; GCN-NEXT: v_interp_p2_f32 v2, v3, v1, v2 wait_exp:7 | ||
; GCN-NEXT: v_mov_b32_dpp v4, v4 dpp8:[1,0,3,2,5,4,7,6] | ||
; GCN-NEXT: v_mov_b32_dpp v6, v6 dpp8:[1,0,3,2,5,4,7,6] | ||
; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo | ||
; GCN-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo | ||
; GCN-NEXT: v_cndmask_b32_e32 v5, v2, v6, vcc_lo | ||
; GCN-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo | ||
; GCN-NEXT: v_mov_b32_dpp v4, v4 dpp8:[1,0,3,2,5,4,7,6] | ||
; GCN-NEXT: v_mov_b32_dpp v5, v5 dpp8:[1,0,3,2,5,4,7,6] | ||
; GCN-NEXT: s_mov_b32 exec_lo, s1 | ||
; GCN-NEXT: exp dual_src_blend0 v3, v2, off, off | ||
; GCN-NEXT: exp dual_src_blend1 v4, v5, off, off done | ||
; GCN-NEXT: s_endpgm | ||
.entry: | ||
; Split the interpolation coordinates: .i0 is used by p10, .i1 by p2. | ||
%InterpCenter.i0 = extractelement <2 x float> %InterpCenter, i64 0 | ||
%InterpCenter.i1 = extractelement <2 x float> %InterpCenter, i64 1 | ||
; Load four parameter values; the first operand runs 0..3 and the second is | ||
; always 1 (printed as attr1.x/.y/.z/.w in the expected output above). | ||
%i6 = call float @llvm.amdgcn.lds.param.load(i32 immarg 0, i32 immarg 1, i32 %PrimMask) | ||
%i7 = call float @llvm.amdgcn.lds.param.load(i32 immarg 1, i32 immarg 1, i32 %PrimMask) | ||
%i8 = call float @llvm.amdgcn.lds.param.load(i32 immarg 2, i32 immarg 1, i32 %PrimMask) | ||
%i9 = call float @llvm.amdgcn.lds.param.load(i32 immarg 3, i32 immarg 1, i32 %PrimMask) | ||
|
||
; Two-step interpolation (p10 then p2) of the value loaded with operand 2. | ||
%i14 = call float @llvm.amdgcn.interp.inreg.p10(float %i8, float %InterpCenter.i0, float %i8) | ||
%i15 = call float @llvm.amdgcn.interp.inreg.p2(float %i8, float %InterpCenter.i1, float %i14) | ||
|
||
; Same two-step interpolation for the value loaded with operand 1. | ||
%i16 = call float @llvm.amdgcn.interp.inreg.p10(float %i7, float %InterpCenter.i0, float %i7) | ||
%i17 = call float @llvm.amdgcn.interp.inreg.p2(float %i7, float %InterpCenter.i1, float %i16) | ||
|
||
; Same two-step interpolation for the value loaded with operand 0. | ||
%i18 = call float @llvm.amdgcn.interp.inreg.p10(float %i6, float %InterpCenter.i0, float %i6) | ||
%i19 = call float @llvm.amdgcn.interp.inreg.p2(float %i6, float %InterpCenter.i1, float %i18) | ||
|
||
; Same two-step interpolation for the value loaded with operand 3. | ||
%i20 = call float @llvm.amdgcn.interp.inreg.p10(float %i9, float %InterpCenter.i0, float %i9) | ||
%i21 = call float @llvm.amdgcn.interp.inreg.p2(float %i9, float %InterpCenter.i1, float %i20) | ||
|
||
; %.not is true when bit 0 of the mbcnt.lo/mbcnt.hi result is clear. | ||
; NOTE(review): with an all-ones mask this is presumably the lane index, making | ||
; %.not an "even lane" predicate - confirm against the intrinsic documentation. | ||
%i34 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) | ||
%i35 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %i34) | ||
%i36 = and i32 %i35, 1 | ||
%.not = icmp eq i32 %i36, 0 | ||
|
||
; First pair (%i15, %i17). The selector 14570689 packs eight 3-bit fields | ||
; 1,0,3,2,5,4,7,6 (low field first), i.e. the dpp8:[1,0,3,2,5,4,7,6] pattern | ||
; in the expected output: each lane reads from its even/odd neighbour. | ||
; %i40 picks %i37 where %.not is true and the shuffled %i38 elsewhere; %i42 is | ||
; the complementary selection and is shuffled once more to produce %i43. | ||
%i37 = bitcast float %i15 to i32 | ||
%i38 = bitcast float %i17 to i32 | ||
%i39 = call i32 @llvm.amdgcn.mov.dpp8.i32(i32 %i38, i32 14570689) | ||
%i40 = select i1 %.not, i32 %i37, i32 %i39 | ||
%i41 = bitcast i32 %i40 to float | ||
%i42 = select i1 %.not, i32 %i39, i32 %i37 | ||
%i43 = call i32 @llvm.amdgcn.mov.dpp8.i32(i32 %i42, i32 14570689) | ||
%i44 = bitcast i32 %i43 to float | ||
|
||
; Second pair (%i19, %i21): the same select / shuffle sequence as above. | ||
%i45 = bitcast float %i19 to i32 | ||
%i46 = bitcast float %i21 to i32 | ||
%i47 = call i32 @llvm.amdgcn.mov.dpp8.i32(i32 %i46, i32 14570689) | ||
%i48 = select i1 %.not, i32 %i45, i32 %i47 | ||
%i49 = bitcast i32 %i48 to float | ||
%i50 = select i1 %.not, i32 %i47, i32 %i45 | ||
%i51 = call i32 @llvm.amdgcn.mov.dpp8.i32(i32 %i50, i32 14570689) | ||
%i52 = bitcast i32 %i51 to float | ||
; Export targets 21 and 22 are printed as dual_src_blend0 and dual_src_blend1 | ||
; in the expected output. Enable mask 3 exports only the first two sources | ||
; (the undef ones print as `off`); only the second export is expected to | ||
; carry `done`. | ||
call void @llvm.amdgcn.exp.f32(i32 immarg 21, i32 immarg 3, float %i41, float %i49, float undef, float undef, i1 immarg false, i1 immarg true) | ||
call void @llvm.amdgcn.exp.f32(i32 immarg 22, i32 immarg 3, float %i44, float %i52, float undef, float undef, i1 immarg true, i1 immarg true) | ||
ret void | ||
} | ||
|
||
; Declarations of the AMDGPU intrinsics called by @_amdgpu_ps_main. The | ||
; trailing #N on each line selects one of the attribute groups defined at the | ||
; end of the file. | ||
declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #2 | ||
declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #2 | ||
declare i32 @llvm.amdgcn.mov.dpp8.i32(i32, i32 immarg) #3 | ||
declare void @llvm.amdgcn.exp.f32(i32 immarg, i32 immarg, float, float, float, float, i1 immarg, i1 immarg) #4 | ||
declare float @llvm.amdgcn.interp.inreg.p10(float, float, float) #1 | ||
declare float @llvm.amdgcn.interp.inreg.p2(float, float, float) #1 | ||
declare float @llvm.amdgcn.lds.param.load(i32 immarg, i32 immarg, i32) #1 | ||
|
||
; Attribute groups: #0 is attached to @_amdgpu_ps_main; #1 to the interp and | ||
; lds.param.load intrinsics; #2 to mbcnt.lo/mbcnt.hi; #3 (the only group | ||
; marked convergent) to mov.dpp8; #4 to exp.f32. | ||
attributes #0 = { nounwind } | ||
attributes #1 = { nounwind readnone speculatable willreturn } | ||
attributes #2 = { nounwind readnone willreturn } | ||
attributes #3 = { convergent nounwind readnone willreturn } | ||
attributes #4 = { inaccessiblememonly nounwind willreturn writeonly } |