Revert "Update amdgpu_gfx functions to use s0-s3 for inreg SGPR arguments on targets using scratch instructions for stack #78226" #86273

Merged
merged 1 commit on Mar 26, 2024
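This revert restores the previous amdgpu_gfx inreg convention: SGPR0-SGPR3 are again left out of the CC_SI_Gfx allocation order, so the first inreg SGPR argument lands in s4 rather than s0, while s0-s3 keep carrying the scratch resource descriptor copy visible in the call-lowering tests below. As a minimal sketch (hypothetical, not part of this PR's diff) of the kind of function affected:

; Hypothetical example: an amdgpu_gfx function with a scalar inreg argument.
; After this revert, codegen expects %in to arrive in s4, analogous to the
; updated bf16.ll check "v_mov_b32_e32 v2, s4" for a bfloat inreg argument.
define amdgpu_gfx void @store_inreg_arg(i32 inreg %in, ptr addrspace(1) %out) {
  store i32 %in, ptr addrspace(1) %out
  ret void
}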
5 changes: 1 addition & 4 deletions llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -715,13 +715,10 @@ bool AMDGPUCallLowering::lowerFormalArguments(
if (!IsEntryFunc && !IsGraphics) {
// For the fixed ABI, pass workitem IDs in the last argument register.
TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
}

if (!IsEntryFunc) {
if (!Subtarget.enableFlatScratch())
CCInfo.AllocateReg(Info->getScratchRSrcReg());
if (!IsGraphics)
TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
}

IncomingValueAssigner Assigner(AssignFn);
1 change: 0 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -23,7 +23,6 @@ def CC_SI_Gfx : CallingConv<[
// 33 is reserved for the frame pointer
// 34 is reserved for the base pointer
CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<[
SGPR0, SGPR1, SGPR2, SGPR3,
SGPR4, SGPR5, SGPR6, SGPR7,
SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
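On the caller side the same register shift applies: with SGPR0-SGPR3 dropped from the allocation order above, the first inreg actual is assigned to $sgpr4, which is what the updated irtranslator checks below expect. A hedged caller-side sketch (function names are made up, not taken from the PR):

declare amdgpu_gfx void @gfx_callee(i32 inreg)

define amdgpu_gfx void @gfx_caller() {
  ; The constant is expected to be copied into $sgpr4 (previously $sgpr0) before
  ; the call, mirroring test_gfx_call_external_void_func_i32_imm_inreg below.
  call amdgpu_gfx void @gfx_callee(i32 inreg 42)
  ret void
}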
7 changes: 3 additions & 4 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2857,13 +2857,12 @@ SDValue SITargetLowering::LowerFormalArguments(
} else if (!IsGraphics) {
// For the fixed ABI, pass workitem IDs in the last argument register.
allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
}

if (!IsEntryFunc) {
// FIXME: Sink this into allocateSpecialInputSGPRs
if (!Subtarget->enableFlatScratch())
CCInfo.AllocateReg(Info->getScratchRSrcReg());
if (!IsGraphics)
allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);

allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
}

if (!IsKernel) {
@@ -50,10 +50,10 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_i32_imm_inreg(i32 inreg
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_i32_inreg
; CHECK-NEXT: $sgpr0 = COPY [[C]](s32)
; CHECK-NEXT: $sgpr4 = COPY [[C]](s32)
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)
; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: SI_RETURN
call amdgpu_gfx void @external_gfx_void_func_i32_inreg(i32 inreg 42)
@@ -99,11 +99,11 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_struct_i8_i32_inreg() #
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_struct_i8_i32_inreg
; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[LOAD1]](s8)
; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT]](s16)
; CHECK-NEXT: $sgpr0 = COPY [[ANYEXT1]](s32)
; CHECK-NEXT: $sgpr1 = COPY [[LOAD2]](s32)
; CHECK-NEXT: $sgpr4 = COPY [[ANYEXT1]](s32)
; CHECK-NEXT: $sgpr5 = COPY [[LOAD2]](s32)
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: SI_RETURN
%ptr0 = load ptr addrspace(1), ptr addrspace(4) undef
10 changes: 5 additions & 5 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
@@ -942,10 +942,10 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_i32_imm_inreg(i32 inreg
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_i32_inreg
; CHECK-NEXT: $sgpr0 = COPY [[C]](s32)
; CHECK-NEXT: $sgpr4 = COPY [[C]](s32)
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)
; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: SI_RETURN
call amdgpu_gfx void @external_gfx_void_func_i32_inreg(i32 inreg 42)
@@ -3984,11 +3984,11 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_struct_i8_i32_inreg() #
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_struct_i8_i32_inreg
; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[LOAD1]](s8)
; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT]](s16)
; CHECK-NEXT: $sgpr0 = COPY [[ANYEXT1]](s32)
; CHECK-NEXT: $sgpr1 = COPY [[LOAD2]](s32)
; CHECK-NEXT: $sgpr4 = COPY [[ANYEXT1]](s32)
; CHECK-NEXT: $sgpr5 = COPY [[LOAD2]](s32)
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: SI_RETURN
%ptr0 = load ptr addrspace(1), ptr addrspace(4) undef
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -3337,7 +3337,7 @@ define amdgpu_gfx void @test_inreg_arg_store(bfloat inreg %in, ptr addrspace(1)
; GFX11-LABEL: test_inreg_arg_store:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v2, s0
; GFX11-NEXT: v_mov_b32_e32 v2, s4
; GFX11-NEXT: global_store_b16 v[0:1], v2, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
store bfloat %in, ptr addrspace(1) %out
24 changes: 12 additions & 12 deletions llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll
@@ -472,7 +472,7 @@ define amdgpu_gfx void @test34(i32 inreg %arg1, i32 inreg %arg2) {
; GCN-LABEL: test34:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_min_i32 s0, s0, s1
; GCN-NEXT: s_min_i32 s0, s4, s5
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_cmpk_lt_i32 s0, 0x3e9
; GCN-NEXT: v_mov_b32_e32 v1, 0
@@ -492,7 +492,7 @@ define amdgpu_gfx void @test35(i32 inreg %arg1, i32 inreg %arg2) {
; GCN-LABEL: test35:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_max_i32 s0, s0, s1
; GCN-NEXT: s_max_i32 s0, s4, s5
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_cmpk_gt_i32 s0, 0x3e8
; GCN-NEXT: v_mov_b32_e32 v1, 0
@@ -512,9 +512,9 @@ define amdgpu_gfx void @test36(i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3
; GCN-LABEL: test36:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_min_u32 s0, s0, s1
; GCN-NEXT: s_min_u32 s0, s4, s5
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_cmp_lt_u32 s0, s2
; GCN-NEXT: s_cmp_lt_u32 s0, s6
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_cselect_b32 s0, -1, 0
; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
@@ -532,9 +532,9 @@ define amdgpu_gfx void @test37(i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3
; GCN-LABEL: test37:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_max_i32 s0, s0, s1
; GCN-NEXT: s_max_i32 s0, s4, s5
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_cmp_ge_i32 s0, s2
; GCN-NEXT: s_cmp_ge_i32 s0, s6
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_cselect_b32 s0, -1, 0
; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
@@ -552,7 +552,7 @@ define amdgpu_gfx void @test38(i32 inreg %arg1, i32 inreg %arg2) {
; GCN-LABEL: test38:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_max_u32 s0, s0, s1
; GCN-NEXT: s_max_u32 s0, s4, s5
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_cmpk_lt_u32 s0, 0x3e9
; GCN-NEXT: v_mov_b32_e32 v1, 0
@@ -572,7 +572,7 @@ define amdgpu_gfx void @test39(i32 inreg %arg1, i32 inreg %arg2) {
; GCN-LABEL: test39:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_min_i32 s0, s0, s1
; GCN-NEXT: s_min_i32 s0, s4, s5
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_cmpk_gt_i32 s0, 0x3e7
; GCN-NEXT: v_mov_b32_e32 v1, 0
@@ -592,9 +592,9 @@ define amdgpu_gfx void @test40(i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3
; GCN-LABEL: test40:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_max_i32 s0, s0, s1
; GCN-NEXT: s_max_i32 s0, s4, s5
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_cmp_le_i32 s0, s2
; GCN-NEXT: s_cmp_le_i32 s0, s6
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_cselect_b32 s0, -1, 0
; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
@@ -612,9 +612,9 @@ define amdgpu_gfx void @test41(i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3
; GCN-LABEL: test41:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_min_u32 s0, s0, s1
; GCN-NEXT: s_min_u32 s0, s4, s5
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_cmp_ge_u32 s0, s2
; GCN-NEXT: s_cmp_ge_u32 s0, s6
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_cselect_b32 s0, -1, 0
; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
133 changes: 0 additions & 133 deletions llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
@@ -2176,93 +2176,6 @@ define void @void_func_a5i32_inreg([5 x i32] inreg %arg0, ptr addrspace(1) %ptr)
declare void @extern()

define void @void_func_a13i32_inreg([13 x i32] inreg %arg0, ptr addrspace(1) %ptr) {
; GFX9-LABEL: void_func_a13i32_inreg:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s27, s33
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_or_saveexec_b64 s[28:29], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[28:29]
; GFX9-NEXT: v_mov_b32_e32 v2, s26
; GFX9-NEXT: global_store_dword v[0:1], v2, off offset:48
; GFX9-NEXT: v_mov_b32_e32 v5, s25
; GFX9-NEXT: v_mov_b32_e32 v4, s24
; GFX9-NEXT: v_mov_b32_e32 v3, s23
; GFX9-NEXT: v_mov_b32_e32 v2, s22
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:32
; GFX9-NEXT: v_writelane_b32 v40, s27, 2
; GFX9-NEXT: v_mov_b32_e32 v5, s21
; GFX9-NEXT: v_mov_b32_e32 v4, s20
; GFX9-NEXT: v_mov_b32_e32 v3, s19
; GFX9-NEXT: v_mov_b32_e32 v2, s18
; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: v_mov_b32_e32 v5, s17
; GFX9-NEXT: v_mov_b32_e32 v4, s16
; GFX9-NEXT: s_getpc_b64 s[16:17]
; GFX9-NEXT: s_add_u32 s16, s16, extern@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s17, s17, extern@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
; GFX9-NEXT: v_mov_b32_e32 v3, s7
; GFX9-NEXT: v_mov_b32_e32 v2, s6
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
; GFX9-NEXT: s_mov_b32 s33, s4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: void_func_a13i32_inreg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s23, s33
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_or_saveexec_b32 s24, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s24
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_dual_mov_b32 v4, s20 :: v_dual_mov_b32 v3, s19
; GFX11-NEXT: v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v9, s17
; GFX11-NEXT: s_getpc_b64 s[18:19]
; GFX11-NEXT: s_add_u32 s18, s18, extern@gotpcrel32@lo+4
; GFX11-NEXT: s_addc_u32 s19, s19, extern@gotpcrel32@hi+12
; GFX11-NEXT: v_dual_mov_b32 v8, s16 :: v_dual_mov_b32 v7, s7
; GFX11-NEXT: s_load_b64 s[16:17], s[18:19], 0x0
; GFX11-NEXT: v_writelane_b32 v40, s23, 2
; GFX11-NEXT: v_dual_mov_b32 v14, s22 :: v_dual_mov_b32 v5, s21
; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v13, s3
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_dual_mov_b32 v12, s2 :: v_dual_mov_b32 v11, s1
; GFX11-NEXT: v_mov_b32_e32 v10, s0
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: global_store_b32 v[0:1], v14, off offset:48
; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off offset:32
; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: s_add_i32 s32, s32, -16
; GFX11-NEXT: s_mov_b32 s33, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
store [13 x i32] %arg0, ptr addrspace(1) %ptr
call void @extern()
ret void
@@ -2290,52 +2203,6 @@ define void @void_func_a13i32_inreg([13 x i32] inreg %arg0, ptr addrspace(1) %p

; FIXME: Should still fail
define void @void_func_a16i32_inreg__noimplicit([16 x i32] inreg %arg0, ptr addrspace(1) %ptr) {
; GFX9-LABEL: void_func_a16i32_inreg__noimplicit:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v5, s19
; GFX9-NEXT: v_mov_b32_e32 v4, s18
; GFX9-NEXT: v_mov_b32_e32 v3, s17
; GFX9-NEXT: v_mov_b32_e32 v2, s16
; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:48
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v5, s15
; GFX9-NEXT: v_mov_b32_e32 v4, s14
; GFX9-NEXT: v_mov_b32_e32 v3, s13
; GFX9-NEXT: v_mov_b32_e32 v2, s12
; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:32
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v5, s11
; GFX9-NEXT: v_mov_b32_e32 v4, s10
; GFX9-NEXT: v_mov_b32_e32 v3, s9
; GFX9-NEXT: v_mov_b32_e32 v2, s8
; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v5, s7
; GFX9-NEXT: v_mov_b32_e32 v4, s6
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: void_func_a16i32_inreg__noimplicit:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v5, s15 :: v_dual_mov_b32 v4, s14
; GFX11-NEXT: v_dual_mov_b32 v3, s13 :: v_dual_mov_b32 v2, s12
; GFX11-NEXT: v_dual_mov_b32 v9, s11 :: v_dual_mov_b32 v8, s10
; GFX11-NEXT: v_dual_mov_b32 v7, s9 :: v_dual_mov_b32 v6, s8
; GFX11-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
; GFX11-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
; GFX11-NEXT: v_dual_mov_b32 v17, s3 :: v_dual_mov_b32 v16, s2
; GFX11-NEXT: v_dual_mov_b32 v15, s1 :: v_dual_mov_b32 v14, s0
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off offset:48
; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:32
; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off offset:16
; GFX11-NEXT: global_store_b128 v[0:1], v[14:17], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
store [16 x i32] %arg0, ptr addrspace(1) %ptr
ret void
}