Skip to content

Commit

Permalink
Update amdgpu_gfx functions to use s0-s3 for inreg SGPR arguments on targets using scratch instructions for stack #78226 (#81394)
Browse files Browse the repository at this point in the history

Resolve #78226
  • Loading branch information
SahilPatidar committed Mar 21, 2024
1 parent 83e5a12 commit 3ac243b
Show file tree
Hide file tree
Showing 12 changed files with 2,069 additions and 2,560 deletions.
5 changes: 4 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -715,10 +715,13 @@ bool AMDGPUCallLowering::lowerFormalArguments(
if (!IsEntryFunc && !IsGraphics) {
// For the fixed ABI, pass workitem IDs in the last argument register.
TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
}

if (!IsEntryFunc) {
if (!Subtarget.enableFlatScratch())
CCInfo.AllocateReg(Info->getScratchRSrcReg());
TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
if (!IsGraphics)
TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
}

IncomingValueAssigner Assigner(AssignFn);
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ def CC_SI_Gfx : CallingConv<[
// 33 is reserved for the frame pointer
// 34 is reserved for the base pointer
CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<[
SGPR0, SGPR1, SGPR2, SGPR3,
SGPR4, SGPR5, SGPR6, SGPR7,
SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
Expand Down
7 changes: 4 additions & 3 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2857,12 +2857,13 @@ SDValue SITargetLowering::LowerFormalArguments(
} else if (!IsGraphics) {
// For the fixed ABI, pass workitem IDs in the last argument register.
allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
}

// FIXME: Sink this into allocateSpecialInputSGPRs
if (!IsEntryFunc) {
if (!Subtarget->enableFlatScratch())
CCInfo.AllocateReg(Info->getScratchRSrcReg());

allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
if (!IsGraphics)
allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
}

if (!IsKernel) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,10 +50,10 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_i32_imm_inreg(i32 inreg
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_i32_inreg
; CHECK-NEXT: $sgpr4 = COPY [[C]](s32)
; CHECK-NEXT: $sgpr0 = COPY [[C]](s32)
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)
; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: SI_RETURN
call amdgpu_gfx void @external_gfx_void_func_i32_inreg(i32 inreg 42)
Expand Down Expand Up @@ -99,11 +99,11 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_struct_i8_i32_inreg() #
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_struct_i8_i32_inreg
; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[LOAD1]](s8)
; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT]](s16)
; CHECK-NEXT: $sgpr4 = COPY [[ANYEXT1]](s32)
; CHECK-NEXT: $sgpr5 = COPY [[LOAD2]](s32)
; CHECK-NEXT: $sgpr0 = COPY [[ANYEXT1]](s32)
; CHECK-NEXT: $sgpr1 = COPY [[LOAD2]](s32)
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: SI_RETURN
%ptr0 = load ptr addrspace(1), ptr addrspace(4) undef
Expand Down
10 changes: 5 additions & 5 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
Original file line number Diff line number Diff line change
Expand Up @@ -942,10 +942,10 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_i32_imm_inreg(i32 inreg
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_i32_inreg
; CHECK-NEXT: $sgpr4 = COPY [[C]](s32)
; CHECK-NEXT: $sgpr0 = COPY [[C]](s32)
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)
; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: SI_RETURN
call amdgpu_gfx void @external_gfx_void_func_i32_inreg(i32 inreg 42)
Expand Down Expand Up @@ -3984,11 +3984,11 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_struct_i8_i32_inreg() #
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_struct_i8_i32_inreg
; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[LOAD1]](s8)
; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT]](s16)
; CHECK-NEXT: $sgpr4 = COPY [[ANYEXT1]](s32)
; CHECK-NEXT: $sgpr5 = COPY [[LOAD2]](s32)
; CHECK-NEXT: $sgpr0 = COPY [[ANYEXT1]](s32)
; CHECK-NEXT: $sgpr1 = COPY [[LOAD2]](s32)
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: SI_RETURN
%ptr0 = load ptr addrspace(1), ptr addrspace(4) undef
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/bf16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -3337,7 +3337,7 @@ define amdgpu_gfx void @test_inreg_arg_store(bfloat inreg %in, ptr addrspace(1)
; GFX11-LABEL: test_inreg_arg_store:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v2, s4
; GFX11-NEXT: v_mov_b32_e32 v2, s0
; GFX11-NEXT: global_store_b16 v[0:1], v2, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
store bfloat %in, ptr addrspace(1) %out
Expand Down
24 changes: 12 additions & 12 deletions llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll
Original file line number Diff line number Diff line change
Expand Up @@ -472,7 +472,7 @@ define amdgpu_gfx void @test34(i32 inreg %arg1, i32 inreg %arg2) {
; GCN-LABEL: test34:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_min_i32 s0, s4, s5
; GCN-NEXT: s_min_i32 s0, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_cmpk_lt_i32 s0, 0x3e9
; GCN-NEXT: v_mov_b32_e32 v1, 0
Expand All @@ -492,7 +492,7 @@ define amdgpu_gfx void @test35(i32 inreg %arg1, i32 inreg %arg2) {
; GCN-LABEL: test35:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_max_i32 s0, s4, s5
; GCN-NEXT: s_max_i32 s0, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_cmpk_gt_i32 s0, 0x3e8
; GCN-NEXT: v_mov_b32_e32 v1, 0
Expand All @@ -512,9 +512,9 @@ define amdgpu_gfx void @test36(i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3
; GCN-LABEL: test36:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_min_u32 s0, s4, s5
; GCN-NEXT: s_min_u32 s0, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_cmp_lt_u32 s0, s6
; GCN-NEXT: s_cmp_lt_u32 s0, s2
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_cselect_b32 s0, -1, 0
; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
Expand All @@ -532,9 +532,9 @@ define amdgpu_gfx void @test37(i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3
; GCN-LABEL: test37:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_max_i32 s0, s4, s5
; GCN-NEXT: s_max_i32 s0, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_cmp_ge_i32 s0, s6
; GCN-NEXT: s_cmp_ge_i32 s0, s2
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_cselect_b32 s0, -1, 0
; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
Expand All @@ -552,7 +552,7 @@ define amdgpu_gfx void @test38(i32 inreg %arg1, i32 inreg %arg2) {
; GCN-LABEL: test38:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_max_u32 s0, s4, s5
; GCN-NEXT: s_max_u32 s0, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_cmpk_lt_u32 s0, 0x3e9
; GCN-NEXT: v_mov_b32_e32 v1, 0
Expand All @@ -572,7 +572,7 @@ define amdgpu_gfx void @test39(i32 inreg %arg1, i32 inreg %arg2) {
; GCN-LABEL: test39:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_min_i32 s0, s4, s5
; GCN-NEXT: s_min_i32 s0, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_cmpk_gt_i32 s0, 0x3e7
; GCN-NEXT: v_mov_b32_e32 v1, 0
Expand All @@ -592,9 +592,9 @@ define amdgpu_gfx void @test40(i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3
; GCN-LABEL: test40:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_max_i32 s0, s4, s5
; GCN-NEXT: s_max_i32 s0, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_cmp_le_i32 s0, s6
; GCN-NEXT: s_cmp_le_i32 s0, s2
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_cselect_b32 s0, -1, 0
; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
Expand All @@ -612,9 +612,9 @@ define amdgpu_gfx void @test41(i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3
; GCN-LABEL: test41:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_min_u32 s0, s4, s5
; GCN-NEXT: s_min_u32 s0, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_cmp_ge_u32 s0, s6
; GCN-NEXT: s_cmp_ge_u32 s0, s2
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_cselect_b32 s0, -1, 0
; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
Expand Down
133 changes: 133 additions & 0 deletions llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2176,6 +2176,93 @@ define void @void_func_a5i32_inreg([5 x i32] inreg %arg0, ptr addrspace(1) %ptr)
declare void @extern()

define void @void_func_a13i32_inreg([13 x i32] inreg %arg0, ptr addrspace(1) %ptr) {
; GFX9-LABEL: void_func_a13i32_inreg:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s27, s33
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_or_saveexec_b64 s[28:29], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[28:29]
; GFX9-NEXT: v_mov_b32_e32 v2, s26
; GFX9-NEXT: global_store_dword v[0:1], v2, off offset:48
; GFX9-NEXT: v_mov_b32_e32 v5, s25
; GFX9-NEXT: v_mov_b32_e32 v4, s24
; GFX9-NEXT: v_mov_b32_e32 v3, s23
; GFX9-NEXT: v_mov_b32_e32 v2, s22
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:32
; GFX9-NEXT: v_writelane_b32 v40, s27, 2
; GFX9-NEXT: v_mov_b32_e32 v5, s21
; GFX9-NEXT: v_mov_b32_e32 v4, s20
; GFX9-NEXT: v_mov_b32_e32 v3, s19
; GFX9-NEXT: v_mov_b32_e32 v2, s18
; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: v_mov_b32_e32 v5, s17
; GFX9-NEXT: v_mov_b32_e32 v4, s16
; GFX9-NEXT: s_getpc_b64 s[16:17]
; GFX9-NEXT: s_add_u32 s16, s16, extern@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s17, s17, extern@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
; GFX9-NEXT: v_mov_b32_e32 v3, s7
; GFX9-NEXT: v_mov_b32_e32 v2, s6
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
; GFX9-NEXT: s_mov_b32 s33, s4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: void_func_a13i32_inreg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s23, s33
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_or_saveexec_b32 s24, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s24
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_dual_mov_b32 v4, s20 :: v_dual_mov_b32 v3, s19
; GFX11-NEXT: v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v9, s17
; GFX11-NEXT: s_getpc_b64 s[18:19]
; GFX11-NEXT: s_add_u32 s18, s18, extern@gotpcrel32@lo+4
; GFX11-NEXT: s_addc_u32 s19, s19, extern@gotpcrel32@hi+12
; GFX11-NEXT: v_dual_mov_b32 v8, s16 :: v_dual_mov_b32 v7, s7
; GFX11-NEXT: s_load_b64 s[16:17], s[18:19], 0x0
; GFX11-NEXT: v_writelane_b32 v40, s23, 2
; GFX11-NEXT: v_dual_mov_b32 v14, s22 :: v_dual_mov_b32 v5, s21
; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v13, s3
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_dual_mov_b32 v12, s2 :: v_dual_mov_b32 v11, s1
; GFX11-NEXT: v_mov_b32_e32 v10, s0
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: global_store_b32 v[0:1], v14, off offset:48
; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off offset:32
; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: s_add_i32 s32, s32, -16
; GFX11-NEXT: s_mov_b32 s33, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
store [13 x i32] %arg0, ptr addrspace(1) %ptr
call void @extern()
ret void
Expand Down Expand Up @@ -2203,6 +2290,52 @@ define void @void_func_a13i32_inreg([13 x i32] inreg %arg0, ptr addrspace(1) %p

; FIXME: Should still fail
; Takes a 16-element i32 array marked inreg plus a global (addrspace(1))
; pointer, stores the whole array through the pointer, and returns without
; making any call.
; Per the checks below, the GFX9 output reads the array elements from s4-s19,
; while the GFX11 output reads them from s0-s15.
define void @void_func_a16i32_inreg__noimplicit([16 x i32] inreg %arg0, ptr addrspace(1) %ptr) {
; GFX9-LABEL: void_func_a16i32_inreg__noimplicit:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v5, s19
; GFX9-NEXT: v_mov_b32_e32 v4, s18
; GFX9-NEXT: v_mov_b32_e32 v3, s17
; GFX9-NEXT: v_mov_b32_e32 v2, s16
; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:48
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v5, s15
; GFX9-NEXT: v_mov_b32_e32 v4, s14
; GFX9-NEXT: v_mov_b32_e32 v3, s13
; GFX9-NEXT: v_mov_b32_e32 v2, s12
; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:32
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v5, s11
; GFX9-NEXT: v_mov_b32_e32 v4, s10
; GFX9-NEXT: v_mov_b32_e32 v3, s9
; GFX9-NEXT: v_mov_b32_e32 v2, s8
; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v5, s7
; GFX9-NEXT: v_mov_b32_e32 v4, s6
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: void_func_a16i32_inreg__noimplicit:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v5, s15 :: v_dual_mov_b32 v4, s14
; GFX11-NEXT: v_dual_mov_b32 v3, s13 :: v_dual_mov_b32 v2, s12
; GFX11-NEXT: v_dual_mov_b32 v9, s11 :: v_dual_mov_b32 v8, s10
; GFX11-NEXT: v_dual_mov_b32 v7, s9 :: v_dual_mov_b32 v6, s8
; GFX11-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
; GFX11-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
; GFX11-NEXT: v_dual_mov_b32 v17, s3 :: v_dual_mov_b32 v16, s2
; GFX11-NEXT: v_dual_mov_b32 v15, s1 :: v_dual_mov_b32 v14, s0
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off offset:48
; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:32
; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off offset:16
; GFX11-NEXT: global_store_b128 v[0:1], v[14:17], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
store [16 x i32] %arg0, ptr addrspace(1) %ptr
ret void
}
Expand Down
Loading

0 comments on commit 3ac243b

Please sign in to comment.