diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index 8969b3b5b6ce0..7e1f041fa1093 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -715,13 +715,10 @@ bool AMDGPUCallLowering::lowerFormalArguments( if (!IsEntryFunc && !IsGraphics) { // For the fixed ABI, pass workitem IDs in the last argument register. TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info); - } - if (!IsEntryFunc) { if (!Subtarget.enableFlatScratch()) CCInfo.AllocateReg(Info->getScratchRSrcReg()); - if (!IsGraphics) - TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info); + TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info); } IncomingValueAssigner Assigner(AssignFn); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td index 9bd0fd7eca76f..4be64629ddac8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -23,7 +23,6 @@ def CC_SI_Gfx : CallingConv<[ // 33 is reserved for the frame pointer // 34 is reserved for the base pointer CCIfInRegenableFlatScratch()) CCInfo.AllocateReg(Info->getScratchRSrcReg()); - if (!IsGraphics) - allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info); + + allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info); } if (!IsKernel) { diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll index fad833c0a6ad5..5effd24a75208 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll @@ -50,10 +50,10 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_i32_imm_inreg(i32 inreg ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_i32_inreg - ; CHECK-NEXT: $sgpr0 = COPY [[C]](s32) + ; CHECK-NEXT: $sgpr4 = COPY [[C]](s32) ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>) - ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; CHECK-NEXT: SI_RETURN call amdgpu_gfx void @external_gfx_void_func_i32_inreg(i32 inreg 42) @@ -99,11 +99,11 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_struct_i8_i32_inreg() # ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_struct_i8_i32_inreg ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[LOAD1]](s8) ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT]](s16) - ; CHECK-NEXT: $sgpr0 = COPY [[ANYEXT1]](s32) - ; CHECK-NEXT: $sgpr1 = COPY [[LOAD2]](s32) + ; CHECK-NEXT: $sgpr4 = COPY [[ANYEXT1]](s32) + ; CHECK-NEXT: $sgpr5 = COPY [[LOAD2]](s32) ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>) - ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; CHECK-NEXT: SI_RETURN %ptr0 = load ptr addrspace(1), ptr addrspace(4) undef diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll index 75670604baa1a..392b0ae6823e4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll @@ -942,10 +942,10 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_i32_imm_inreg(i32 inreg ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_i32_inreg - ; CHECK-NEXT: $sgpr0 = COPY [[C]](s32) + ; CHECK-NEXT: $sgpr4 = COPY [[C]](s32) ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>) - ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; CHECK-NEXT: SI_RETURN call amdgpu_gfx void @external_gfx_void_func_i32_inreg(i32 inreg 42) @@ -3984,11 +3984,11 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_struct_i8_i32_inreg() # ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_struct_i8_i32_inreg ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[LOAD1]](s8) ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT]](s16) - ; CHECK-NEXT: $sgpr0 = COPY [[ANYEXT1]](s32) - ; CHECK-NEXT: $sgpr1 = COPY [[LOAD2]](s32) + ; CHECK-NEXT: $sgpr4 = COPY [[ANYEXT1]](s32) + ; CHECK-NEXT: $sgpr5 = COPY [[LOAD2]](s32) ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>) - ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc ; CHECK-NEXT: SI_RETURN %ptr0 = load ptr addrspace(1), ptr addrspace(4) undef diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index e369f7e3b9a5a..98658834e8978 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -3337,7 +3337,7 @@ define amdgpu_gfx void @test_inreg_arg_store(bfloat inreg %in, ptr addrspace(1) ; GFX11-LABEL: test_inreg_arg_store: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off ; GFX11-NEXT: s_setpc_b64 s[30:31] store bfloat %in, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll b/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll index e1e3220cc2755..10d71a315fbf9 100644 --- a/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll +++ b/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll @@ -472,7 +472,7 @@ define amdgpu_gfx void @test34(i32 inreg %arg1, i32 inreg %arg2) { ; GCN-LABEL: test34: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_min_i32 s0, s0, s1 +; GCN-NEXT: s_min_i32 s0, s4, s5 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_cmpk_lt_i32 s0, 0x3e9 ; GCN-NEXT: v_mov_b32_e32 v1, 0 @@ -492,7 +492,7 @@ define amdgpu_gfx void @test35(i32 inreg %arg1, i32 inreg %arg2) { ; GCN-LABEL: test35: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_max_i32 s0, s0, s1 +; GCN-NEXT: s_max_i32 s0, s4, s5 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_cmpk_gt_i32 s0, 0x3e8 ; GCN-NEXT: v_mov_b32_e32 v1, 0 @@ -512,9 +512,9 @@ define amdgpu_gfx void @test36(i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3 ; GCN-LABEL: test36: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_min_u32 s0, s0, s1 +; GCN-NEXT: s_min_u32 s0, s4, s5 ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_cmp_lt_u32 s0, s2 +; GCN-NEXT: s_cmp_lt_u32 s0, s6 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_cselect_b32 s0, -1, 0 ; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 @@ -532,9 +532,9 @@ define amdgpu_gfx void @test37(i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3 ; GCN-LABEL: test37: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_max_i32 s0, s0, s1 +; GCN-NEXT: s_max_i32 s0, s4, s5 ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_cmp_ge_i32 s0, s2 +; GCN-NEXT: s_cmp_ge_i32 s0, s6 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_cselect_b32 s0, -1, 0 ; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 @@ -552,7 +552,7 @@ define amdgpu_gfx void @test38(i32 inreg %arg1, i32 inreg %arg2) { ; GCN-LABEL: test38: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_max_u32 s0, s0, s1 +; GCN-NEXT: s_max_u32 s0, s4, s5 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_cmpk_lt_u32 s0, 0x3e9 ; GCN-NEXT: v_mov_b32_e32 v1, 0 @@ -572,7 +572,7 @@ define amdgpu_gfx void @test39(i32 inreg %arg1, i32 inreg %arg2) { ; GCN-LABEL: test39: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_min_i32 s0, s0, s1 +; GCN-NEXT: s_min_i32 s0, s4, s5 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_cmpk_gt_i32 s0, 0x3e7 ; GCN-NEXT: v_mov_b32_e32 v1, 0 @@ -592,9 +592,9 @@ define amdgpu_gfx void @test40(i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3 ; GCN-LABEL: test40: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_max_i32 s0, s0, s1 +; GCN-NEXT: s_max_i32 s0, s4, s5 ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_cmp_le_i32 s0, s2 +; GCN-NEXT: s_cmp_le_i32 s0, s6 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_cselect_b32 s0, -1, 0 ; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 @@ -612,9 +612,9 @@ define amdgpu_gfx void @test41(i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3 ; GCN-LABEL: test41: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_min_u32 s0, s0, s1 +; GCN-NEXT: s_min_u32 s0, s4, s5 ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_cmp_ge_u32 s0, s2 +; GCN-NEXT: s_cmp_ge_u32 s0, s6 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_cselect_b32 s0, -1, 0 ; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 diff --git a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll index 27845b6b5b2fe..44a9127b4bd09 100644 --- a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll @@ -2176,93 +2176,6 @@ define void @void_func_a5i32_inreg([5 x i32] inreg %arg0, ptr addrspace(1) %ptr) declare void @extern() define void @void_func_a13i32_inreg([13 x i32] inreg %arg0, ptr addrspace(1) %ptr) { -; GFX9-LABEL: void_func_a13i32_inreg: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s27, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_or_saveexec_b64 s[28:29], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[28:29] -; GFX9-NEXT: v_mov_b32_e32 v2, s26 -; GFX9-NEXT: global_store_dword v[0:1], v2, off offset:48 -; GFX9-NEXT: v_mov_b32_e32 v5, s25 -; GFX9-NEXT: v_mov_b32_e32 v4, s24 -; GFX9-NEXT: v_mov_b32_e32 v3, s23 -; GFX9-NEXT: v_mov_b32_e32 v2, s22 -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:32 -; GFX9-NEXT: v_writelane_b32 v40, s27, 2 -; GFX9-NEXT: v_mov_b32_e32 v5, s21 -; GFX9-NEXT: v_mov_b32_e32 v4, s20 -; GFX9-NEXT: v_mov_b32_e32 v3, s19 -; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s17 -; GFX9-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, extern@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, extern@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s4, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: void_func_a13i32_inreg: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s23, s33 -; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_or_saveexec_b32 s24, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s24 -; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_dual_mov_b32 v4, s20 :: v_dual_mov_b32 v3, s19 -; GFX11-NEXT: v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v9, s17 -; GFX11-NEXT: s_getpc_b64 s[18:19] -; GFX11-NEXT: s_add_u32 s18, s18, extern@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s19, s19, extern@gotpcrel32@hi+12 -; GFX11-NEXT: v_dual_mov_b32 v8, s16 :: v_dual_mov_b32 v7, s7 -; GFX11-NEXT: s_load_b64 s[16:17], s[18:19], 0x0 -; GFX11-NEXT: v_writelane_b32 v40, s23, 2 -; GFX11-NEXT: v_dual_mov_b32 v14, s22 :: v_dual_mov_b32 v5, s21 -; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v13, s3 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_dual_mov_b32 v12, s2 :: v_dual_mov_b32 v11, s1 -; GFX11-NEXT: v_mov_b32_e32 v10, s0 -; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: global_store_b32 v[0:1], v14, off offset:48 -; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off offset:32 -; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 -; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 -; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload -; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: s_mov_b32 s33, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_setpc_b64 s[30:31] store [13 x i32] %arg0, ptr addrspace(1) %ptr call void @extern() ret void @@ -2290,52 +2203,6 @@ define void @void_func_a13i32_inreg([13 x i32] inreg %arg0, ptr addrspace(1) %p ; FIXME: Should still fail define void @void_func_a16i32_inreg__noimplicit([16 x i32] inreg %arg0, ptr addrspace(1) %ptr) { -; GFX9-LABEL: void_func_a16i32_inreg__noimplicit: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v5, s19 -; GFX9-NEXT: v_mov_b32_e32 v4, s18 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:48 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s15 -; GFX9-NEXT: v_mov_b32_e32 v4, s14 -; GFX9-NEXT: v_mov_b32_e32 v3, s13 -; GFX9-NEXT: v_mov_b32_e32 v2, s12 -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:32 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s11 -; GFX9-NEXT: v_mov_b32_e32 v4, s10 -; GFX9-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: void_func_a16i32_inreg__noimplicit: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v5, s15 :: v_dual_mov_b32 v4, s14 -; GFX11-NEXT: v_dual_mov_b32 v3, s13 :: v_dual_mov_b32 v2, s12 -; GFX11-NEXT: v_dual_mov_b32 v9, s11 :: v_dual_mov_b32 v8, s10 -; GFX11-NEXT: v_dual_mov_b32 v7, s9 :: v_dual_mov_b32 v6, s8 -; GFX11-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6 -; GFX11-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4 -; GFX11-NEXT: v_dual_mov_b32 v17, s3 :: v_dual_mov_b32 v16, s2 -; GFX11-NEXT: v_dual_mov_b32 v15, s1 :: v_dual_mov_b32 v14, s0 -; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off offset:48 -; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:32 -; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off offset:16 -; GFX11-NEXT: global_store_b128 v[0:1], v[14:17], off -; GFX11-NEXT: s_setpc_b64 s[30:31] store [16 x i32] %arg0, ptr addrspace(1) %ptr ret void } diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll index 3e1db5fb4e1dc..a118fa388f86d 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -9567,17 +9567,19 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 3 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_i8_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_i8_inreg@abs32@lo -; GFX9-NEXT: s_movk_i32 s0, 0x7b +; GFX9-NEXT: s_movk_i32 s4, 0x7b ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 2 +; GFX9-NEXT: v_readlane_b32 s30, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 3 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -9595,17 +9597,19 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s34, 3 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_i8_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_i8_inreg@abs32@lo -; GFX10-NEXT: s_movk_i32 s0, 0x7b ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_movk_i32 s4, 0x7b +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 2 +; GFX10-NEXT: v_readlane_b32 s30, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 3 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -9623,18 +9627,20 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_mov_b32 s3, external_void_func_i8_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s2, external_void_func_i8_inreg@abs32@lo -; GFX11-NEXT: s_movk_i32 s0, 0x7b +; GFX11-NEXT: v_writelane_b32 v40, s0, 3 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_i8_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_i8_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_movk_i32 s4, 0x7b +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 +; GFX11-NEXT: v_writelane_b32 v40, s31, 2 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 2 +; GFX11-NEXT: v_readlane_b32 s30, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 3 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -9652,17 +9658,19 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_i8_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_i8_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: s_movk_i32 s0, 0x7b +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_i8_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_i8_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -9684,17 +9692,19 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 3 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_i16_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_i16_inreg@abs32@lo -; GFX9-NEXT: s_movk_i32 s0, 0x7b +; GFX9-NEXT: s_movk_i32 s4, 0x7b ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 2 +; GFX9-NEXT: v_readlane_b32 s30, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 3 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -9712,17 +9722,19 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s34, 3 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_i16_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_i16_inreg@abs32@lo -; GFX10-NEXT: s_movk_i32 s0, 0x7b ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_movk_i32 s4, 0x7b +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 2 +; GFX10-NEXT: v_readlane_b32 s30, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 3 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -9740,18 +9752,20 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_mov_b32 s3, external_void_func_i16_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s2, external_void_func_i16_inreg@abs32@lo -; GFX11-NEXT: s_movk_i32 s0, 0x7b +; GFX11-NEXT: v_writelane_b32 v40, s0, 3 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_i16_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_i16_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_movk_i32 s4, 0x7b +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 +; GFX11-NEXT: v_writelane_b32 v40, s31, 2 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 2 +; GFX11-NEXT: v_readlane_b32 s30, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 3 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -9769,17 +9783,19 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_i16_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_i16_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: s_movk_i32 s0, 0x7b +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_i16_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_i16_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -9801,17 +9817,19 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 3 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_i32_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_i32_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s0, 42 +; GFX9-NEXT: s_mov_b32 s4, 42 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 2 +; GFX9-NEXT: v_readlane_b32 s30, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 3 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -9829,17 +9847,19 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s34, 3 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_i32_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_i32_inreg@abs32@lo -; GFX10-NEXT: s_mov_b32 s0, 42 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 42 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 2 +; GFX10-NEXT: v_readlane_b32 s30, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 3 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -9857,18 +9877,20 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_mov_b32 s3, external_void_func_i32_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s2, external_void_func_i32_inreg@abs32@lo -; GFX11-NEXT: s_mov_b32 s0, 42 +; GFX11-NEXT: v_writelane_b32 v40, s0, 3 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_i32_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_i32_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 42 +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 +; GFX11-NEXT: v_writelane_b32 v40, s31, 2 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 2 +; GFX11-NEXT: v_readlane_b32 s30, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 3 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -9886,17 +9908,19 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_i32_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_i32_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 42 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_i32_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_i32_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 42 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -9918,18 +9942,22 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 4 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_i64_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_i64_inreg@abs32@lo -; GFX9-NEXT: s_movk_i32 s0, 0x7b -; GFX9-NEXT: s_mov_b32 s1, 0 +; GFX9-NEXT: s_movk_i32 s4, 0x7b +; GFX9-NEXT: s_mov_b32 s5, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 3 +; GFX9-NEXT: v_readlane_b32 s30, v40, 2 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -9947,18 +9975,22 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s34, 4 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_i64_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_i64_inreg@abs32@lo -; GFX10-NEXT: s_movk_i32 s0, 0x7b -; GFX10-NEXT: s_mov_b32 s1, 0 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_movk_i32 s4, 0x7b +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 2 +; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 3 +; GFX10-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -9976,19 +10008,23 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_mov_b32 s3, external_void_func_i64_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s2, external_void_func_i64_inreg@abs32@lo -; GFX11-NEXT: s_movk_i32 s0, 0x7b -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 4 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_i64_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_i64_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_movk_i32 s4, 0x7b +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: v_writelane_b32 v40, s30, 2 +; GFX11-NEXT: v_writelane_b32 v40, s31, 3 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 3 +; GFX11-NEXT: v_readlane_b32 s30, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -10006,18 +10042,22 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_i64_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_i64_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: s_movk_i32 s0, 0x7b -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_i64_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_i64_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -10039,25 +10079,28 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 +; GFX9-NEXT: v_writelane_b32 v40, s34, 6 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: s_load_dwordx4 s[36:39], s[34:35], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s7, 3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 4 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2i64_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2i64_inreg@abs32@lo ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s36 -; GFX9-NEXT: s_mov_b32 s1, s37 -; GFX9-NEXT: s_mov_b32 s2, s38 -; GFX9-NEXT: s_mov_b32 s3, s39 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 5 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 -; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s31, v40, 5 +; GFX9-NEXT: v_readlane_b32 s30, v40, 4 +; GFX9-NEXT: v_readlane_b32 s7, v40, 3 +; GFX9-NEXT: v_readlane_b32 s6, v40, 2 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 6 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -10073,23 +10116,26 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s34, 6 ; GFX10-NEXT: s_mov_b64 s[34:35], 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_load_dwordx4 s[36:39], s[34:35], 0x0 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2i64_inreg@abs32@hi -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2i64_inreg@abs32@lo -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s0, s36 -; GFX10-NEXT: s_mov_b32 s1, s37 -; GFX10-NEXT: s_mov_b32 s2, s38 -; GFX10-NEXT: s_mov_b32 s3, s39 +; GFX10-NEXT: v_writelane_b32 v40, s30, 4 +; GFX10-NEXT: v_writelane_b32 v40, s31, 5 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 5 +; GFX10-NEXT: v_readlane_b32 s30, v40, 4 +; GFX10-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 6 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -10107,19 +10153,27 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v40, s0, 6 ; GFX11-NEXT: s_mov_b64 s[0:1], 0 -; GFX11-NEXT: s_mov_b32 s35, external_void_func_v2i64_inreg@abs32@hi -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-NEXT: s_mov_b32 s34, external_void_func_v2i64_inreg@abs32@lo -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: v_writelane_b32 v40, s6, 2 +; GFX11-NEXT: v_writelane_b32 v40, s7, 3 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2i64_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2i64_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s30, 4 +; GFX11-NEXT: v_writelane_b32 v40, s31, 5 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 5 +; GFX11-NEXT: v_readlane_b32 s30, v40, 4 +; GFX11-NEXT: v_readlane_b32 s7, v40, 3 +; GFX11-NEXT: v_readlane_b32 s6, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 6 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -10137,18 +10191,26 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 6 ; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v2i64_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v2i64_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-SCRATCH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2i64_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2i64_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -10171,20 +10233,28 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 6 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v40, s6, 2 +; GFX9-NEXT: v_writelane_b32 v40, s7, 3 +; GFX9-NEXT: v_writelane_b32 v40, s30, 4 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2i64_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2i64_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s0, 1 -; GFX9-NEXT: s_mov_b32 s1, 2 -; GFX9-NEXT: s_mov_b32 s2, 3 -; GFX9-NEXT: s_mov_b32 s3, 4 +; GFX9-NEXT: s_mov_b32 s4, 1 +; GFX9-NEXT: s_mov_b32 s5, 2 +; GFX9-NEXT: s_mov_b32 s6, 3 +; GFX9-NEXT: s_mov_b32 s7, 4 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 5 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 5 +; GFX9-NEXT: v_readlane_b32 s30, v40, 4 +; GFX9-NEXT: v_readlane_b32 s7, v40, 3 +; GFX9-NEXT: v_readlane_b32 s6, v40, 2 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 6 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -10202,20 +10272,28 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s34, 6 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2i64_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2i64_inreg@abs32@lo -; GFX10-NEXT: s_mov_b32 s0, 1 -; GFX10-NEXT: s_mov_b32 s1, 2 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: s_mov_b32 s2, 3 -; GFX10-NEXT: s_mov_b32 s3, 4 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 1 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2 +; GFX10-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-NEXT: s_mov_b32 s6, 3 +; GFX10-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-NEXT: s_mov_b32 s7, 4 +; GFX10-NEXT: v_writelane_b32 v40, s30, 4 +; GFX10-NEXT: v_writelane_b32 v40, s31, 5 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 5 +; GFX10-NEXT: v_readlane_b32 s30, v40, 4 +; GFX10-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 6 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -10233,21 +10311,29 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_mov_b32 s35, external_void_func_v2i64_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s34, external_void_func_v2i64_inreg@abs32@lo -; GFX11-NEXT: s_mov_b32 s0, 1 -; GFX11-NEXT: s_mov_b32 s1, 2 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: s_mov_b32 s2, 3 -; GFX11-NEXT: s_mov_b32 s3, 4 +; GFX11-NEXT: v_writelane_b32 v40, s0, 6 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2i64_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2i64_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 1 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 2 +; GFX11-NEXT: v_writelane_b32 v40, s6, 2 +; GFX11-NEXT: s_mov_b32 s6, 3 +; GFX11-NEXT: v_writelane_b32 v40, s7, 3 +; GFX11-NEXT: s_mov_b32 s7, 4 +; GFX11-NEXT: v_writelane_b32 v40, s30, 4 +; GFX11-NEXT: v_writelane_b32 v40, s31, 5 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 5 +; GFX11-NEXT: v_readlane_b32 s30, v40, 4 +; GFX11-NEXT: v_readlane_b32 s7, v40, 3 +; GFX11-NEXT: v_readlane_b32 s6, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 6 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -10265,20 +10351,28 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v2i64_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v2i64_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 2 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 3 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 6 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2i64_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2i64_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -10300,29 +10394,32 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 4 -; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: s_load_dwordx4 s[36:39], s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 8 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: v_writelane_b32 v40, s30, 2 +; GFX9-NEXT: v_writelane_b32 v40, s6, 2 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_writelane_b32 v40, s7, 3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s8, 4 +; GFX9-NEXT: v_writelane_b32 v40, s9, 5 +; GFX9-NEXT: v_writelane_b32 v40, s30, 6 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3i64_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3i64_inreg@abs32@lo -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s36 -; GFX9-NEXT: s_mov_b32 s1, s37 -; GFX9-NEXT: s_mov_b32 s2, s38 -; GFX9-NEXT: s_mov_b32 s3, s39 -; GFX9-NEXT: s_mov_b32 s4, 1 -; GFX9-NEXT: s_mov_b32 s5, 2 +; GFX9-NEXT: s_mov_b32 s8, 1 +; GFX9-NEXT: s_mov_b32 s9, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 3 +; GFX9-NEXT: v_writelane_b32 v40, s31, 7 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 3 -; GFX9-NEXT: v_readlane_b32 s30, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 7 +; GFX9-NEXT: v_readlane_b32 s30, v40, 6 +; GFX9-NEXT: v_readlane_b32 s9, v40, 5 +; GFX9-NEXT: v_readlane_b32 s8, v40, 4 +; GFX9-NEXT: v_readlane_b32 s7, v40, 3 +; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 4 +; GFX9-NEXT: v_readlane_b32 s34, v40, 8 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -10340,29 +10437,32 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 4 +; GFX10-NEXT: v_writelane_b32 v40, s34, 8 ; GFX10-NEXT: s_mov_b64 s[34:35], 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_load_dwordx4 s[36:39], s[34:35], 0x0 -; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3i64_inreg@abs32@hi ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3i64_inreg@abs32@lo -; GFX10-NEXT: s_mov_b32 s4, 1 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 2 -; GFX10-NEXT: v_writelane_b32 v40, s30, 2 -; GFX10-NEXT: v_writelane_b32 v40, s31, 3 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s0, s36 -; GFX10-NEXT: s_mov_b32 s1, s37 -; GFX10-NEXT: s_mov_b32 s2, s38 -; GFX10-NEXT: s_mov_b32 s3, s39 +; GFX10-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0 +; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3i64_inreg@abs32@hi +; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3i64_inreg@abs32@lo +; GFX10-NEXT: v_writelane_b32 v40, s8, 4 +; GFX10-NEXT: s_mov_b32 s8, 1 +; GFX10-NEXT: v_writelane_b32 v40, s9, 5 +; GFX10-NEXT: s_mov_b32 s9, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 6 +; GFX10-NEXT: v_writelane_b32 v40, s31, 7 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 3 -; GFX10-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 7 +; GFX10-NEXT: v_readlane_b32 s30, v40, 6 +; GFX10-NEXT: v_readlane_b32 s9, v40, 5 +; GFX10-NEXT: v_readlane_b32 s8, v40, 4 +; GFX10-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 4 +; GFX10-NEXT: v_readlane_b32 s34, v40, 8 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -10380,25 +10480,33 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 4 +; GFX11-NEXT: v_writelane_b32 v40, s0, 8 ; GFX11-NEXT: s_mov_b64 s[0:1], 0 -; GFX11-NEXT: s_mov_b32 s35, external_void_func_v3i64_inreg@abs32@hi -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-NEXT: s_mov_b32 s34, external_void_func_v3i64_inreg@abs32@lo -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 1 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 2 -; GFX11-NEXT: v_writelane_b32 v40, s30, 2 -; GFX11-NEXT: v_writelane_b32 v40, s31, 3 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX11-NEXT: v_writelane_b32 v40, s6, 2 +; GFX11-NEXT: v_writelane_b32 v40, s7, 3 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3i64_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3i64_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s8, 4 +; GFX11-NEXT: s_mov_b32 s8, 1 +; GFX11-NEXT: v_writelane_b32 v40, s9, 5 +; GFX11-NEXT: s_mov_b32 s9, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 6 +; GFX11-NEXT: v_writelane_b32 v40, s31, 7 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 3 -; GFX11-NEXT: v_readlane_b32 s30, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 7 +; GFX11-NEXT: v_readlane_b32 s30, v40, 6 +; GFX11-NEXT: v_readlane_b32 s9, v40, 5 +; GFX11-NEXT: v_readlane_b32 s8, v40, 4 +; GFX11-NEXT: v_readlane_b32 s7, v40, 3 +; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 4 +; GFX11-NEXT: v_readlane_b32 s0, v40, 8 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -10416,24 +10524,32 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 8 ; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v3i64_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v3i64_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-SCRATCH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3i64_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3i64_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s8, 4 +; GFX10-SCRATCH-NEXT: s_mov_b32 s8, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s9, 5 +; GFX10-SCRATCH-NEXT: s_mov_b32 s9, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 6 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 7 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 7 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 6 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s9, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s8, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 8 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -10458,35 +10574,38 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 6 -; GFX9-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-NEXT: s_load_dwordx4 s[36:39], s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 10 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 -; GFX9-NEXT: v_writelane_b32 v40, s30, 4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_writelane_b32 v40, s8, 4 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s9, 5 +; GFX9-NEXT: v_writelane_b32 v40, s10, 6 +; GFX9-NEXT: v_writelane_b32 v40, s11, 7 +; GFX9-NEXT: v_writelane_b32 v40, s30, 8 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v4i64_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v4i64_inreg@abs32@lo -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s36 -; GFX9-NEXT: s_mov_b32 s1, s37 -; GFX9-NEXT: s_mov_b32 s2, s38 -; GFX9-NEXT: s_mov_b32 s3, s39 -; GFX9-NEXT: s_mov_b32 s4, 1 -; GFX9-NEXT: s_mov_b32 s5, 2 -; GFX9-NEXT: s_mov_b32 s6, 3 -; GFX9-NEXT: s_mov_b32 s7, 4 +; GFX9-NEXT: s_mov_b32 s8, 1 +; GFX9-NEXT: s_mov_b32 s9, 2 +; GFX9-NEXT: s_mov_b32 s10, 3 +; GFX9-NEXT: s_mov_b32 s11, 4 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 5 +; GFX9-NEXT: v_writelane_b32 v40, s31, 9 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 5 -; GFX9-NEXT: v_readlane_b32 s30, v40, 4 +; GFX9-NEXT: v_readlane_b32 s31, v40, 9 +; GFX9-NEXT: v_readlane_b32 s30, v40, 8 +; GFX9-NEXT: v_readlane_b32 s11, v40, 7 +; GFX9-NEXT: v_readlane_b32 s10, v40, 6 +; GFX9-NEXT: v_readlane_b32 s9, v40, 5 +; GFX9-NEXT: v_readlane_b32 s8, v40, 4 ; GFX9-NEXT: v_readlane_b32 s7, v40, 3 ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 6 +; GFX9-NEXT: v_readlane_b32 s34, v40, 10 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -10504,35 +10623,38 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 6 +; GFX10-NEXT: v_writelane_b32 v40, s34, 10 ; GFX10-NEXT: s_mov_b64 s[34:35], 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_load_dwordx4 s[36:39], s[34:35], 0x0 -; GFX10-NEXT: s_mov_b32 s35, external_void_func_v4i64_inreg@abs32@hi ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s34, external_void_func_v4i64_inreg@abs32@lo -; GFX10-NEXT: s_mov_b32 s4, 1 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 -; GFX10-NEXT: s_mov_b32 s6, 3 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 -; GFX10-NEXT: s_mov_b32 s7, 4 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s0, s36 -; GFX10-NEXT: s_mov_b32 s1, s37 -; GFX10-NEXT: v_writelane_b32 v40, s30, 4 -; GFX10-NEXT: s_mov_b32 s2, s38 -; GFX10-NEXT: s_mov_b32 s3, s39 -; GFX10-NEXT: v_writelane_b32 v40, s31, 5 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0 +; GFX10-NEXT: s_mov_b32 s35, external_void_func_v4i64_inreg@abs32@hi +; GFX10-NEXT: s_mov_b32 s34, external_void_func_v4i64_inreg@abs32@lo +; GFX10-NEXT: v_writelane_b32 v40, s8, 4 +; GFX10-NEXT: s_mov_b32 s8, 1 +; GFX10-NEXT: v_writelane_b32 v40, s9, 5 +; GFX10-NEXT: s_mov_b32 s9, 2 +; GFX10-NEXT: v_writelane_b32 v40, s10, 6 +; GFX10-NEXT: s_mov_b32 s10, 3 +; GFX10-NEXT: v_writelane_b32 v40, s11, 7 +; GFX10-NEXT: s_mov_b32 s11, 4 +; GFX10-NEXT: v_writelane_b32 v40, s30, 8 +; GFX10-NEXT: v_writelane_b32 v40, s31, 9 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 5 -; GFX10-NEXT: v_readlane_b32 s30, v40, 4 +; GFX10-NEXT: v_readlane_b32 s31, v40, 9 +; GFX10-NEXT: v_readlane_b32 s30, v40, 8 +; GFX10-NEXT: v_readlane_b32 s11, v40, 7 +; GFX10-NEXT: v_readlane_b32 s10, v40, 6 +; GFX10-NEXT: v_readlane_b32 s9, v40, 5 +; GFX10-NEXT: v_readlane_b32 s8, v40, 4 ; GFX10-NEXT: v_readlane_b32 s7, v40, 3 ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 6 +; GFX10-NEXT: v_readlane_b32 s34, v40, 10 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -10550,31 +10672,39 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 6 +; GFX11-NEXT: v_writelane_b32 v40, s0, 10 ; GFX11-NEXT: s_mov_b64 s[0:1], 0 -; GFX11-NEXT: s_mov_b32 s35, external_void_func_v4i64_inreg@abs32@hi -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-NEXT: s_mov_b32 s34, external_void_func_v4i64_inreg@abs32@lo -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 1 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 2 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 -; GFX11-NEXT: s_mov_b32 s6, 3 ; GFX11-NEXT: v_writelane_b32 v40, s7, 3 -; GFX11-NEXT: s_mov_b32 s7, 4 -; GFX11-NEXT: v_writelane_b32 v40, s30, 4 -; GFX11-NEXT: v_writelane_b32 v40, s31, 5 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v4i64_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v4i64_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s8, 4 +; GFX11-NEXT: s_mov_b32 s8, 1 +; GFX11-NEXT: v_writelane_b32 v40, s9, 5 +; GFX11-NEXT: s_mov_b32 s9, 2 +; GFX11-NEXT: v_writelane_b32 v40, s10, 6 +; GFX11-NEXT: s_mov_b32 s10, 3 +; GFX11-NEXT: v_writelane_b32 v40, s11, 7 +; GFX11-NEXT: s_mov_b32 s11, 4 +; GFX11-NEXT: v_writelane_b32 v40, s30, 8 +; GFX11-NEXT: v_writelane_b32 v40, s31, 9 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 5 -; GFX11-NEXT: v_readlane_b32 s30, v40, 4 +; GFX11-NEXT: v_readlane_b32 s31, v40, 9 +; GFX11-NEXT: v_readlane_b32 s30, v40, 8 +; GFX11-NEXT: v_readlane_b32 s11, v40, 7 +; GFX11-NEXT: v_readlane_b32 s10, v40, 6 +; GFX11-NEXT: v_readlane_b32 s9, v40, 5 +; GFX11-NEXT: v_readlane_b32 s8, v40, 4 ; GFX11-NEXT: v_readlane_b32 s7, v40, 3 ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 6 +; GFX11-NEXT: v_readlane_b32 s0, v40, 10 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -10592,30 +10722,38 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 6 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 10 ; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v4i64_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v4i64_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 -; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4 +; GFX10-SCRATCH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v4i64_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v4i64_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s8, 4 +; GFX10-SCRATCH-NEXT: s_mov_b32 s8, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s9, 5 +; GFX10-SCRATCH-NEXT: s_mov_b32 s9, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s10, 6 +; GFX10-SCRATCH-NEXT: s_mov_b32 s10, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s11, 7 +; GFX10-SCRATCH-NEXT: s_mov_b32 s11, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 8 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 9 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 9 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 8 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s11, v40, 7 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s10, v40, 6 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s9, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s8, v40, 4 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 10 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -10639,17 +10777,19 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 3 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_f16_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_f16_inreg@abs32@lo -; GFX9-NEXT: s_movk_i32 s0, 0x4400 +; GFX9-NEXT: s_movk_i32 s4, 0x4400 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 2 +; GFX9-NEXT: v_readlane_b32 s30, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 3 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -10667,17 +10807,19 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s34, 3 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_f16_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_f16_inreg@abs32@lo -; GFX10-NEXT: s_movk_i32 s0, 0x4400 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_movk_i32 s4, 0x4400 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 2 +; GFX10-NEXT: v_readlane_b32 s30, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 3 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -10695,18 +10837,20 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_mov_b32 s3, external_void_func_f16_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s2, external_void_func_f16_inreg@abs32@lo -; GFX11-NEXT: s_movk_i32 s0, 0x4400 +; GFX11-NEXT: v_writelane_b32 v40, s0, 3 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_f16_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_f16_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_movk_i32 s4, 0x4400 +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 +; GFX11-NEXT: v_writelane_b32 v40, s31, 2 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 2 +; GFX11-NEXT: v_readlane_b32 s30, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 3 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -10724,17 +10868,19 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_f16_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_f16_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: s_movk_i32 s0, 0x4400 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_f16_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_f16_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x4400 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -10756,17 +10902,19 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 3 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_f32_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_f32_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s0, 4.0 +; GFX9-NEXT: s_mov_b32 s4, 4.0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 2 +; GFX9-NEXT: v_readlane_b32 s30, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 3 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -10784,17 +10932,19 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s34, 3 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_f32_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_f32_inreg@abs32@lo -; GFX10-NEXT: s_mov_b32 s0, 4.0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 4.0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 2 +; GFX10-NEXT: v_readlane_b32 s30, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 3 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -10812,18 +10962,20 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_mov_b32 s3, external_void_func_f32_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s2, external_void_func_f32_inreg@abs32@lo -; GFX11-NEXT: s_mov_b32 s0, 4.0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 3 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_f32_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_f32_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 4.0 +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 +; GFX11-NEXT: v_writelane_b32 v40, s31, 2 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 2 +; GFX11-NEXT: v_readlane_b32 s30, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 3 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -10841,17 +10993,19 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_f32_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_f32_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 4.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_f32_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_f32_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 4.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -10873,18 +11027,22 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 4 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2f32_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2f32_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s0, 1.0 -; GFX9-NEXT: s_mov_b32 s1, 2.0 +; GFX9-NEXT: s_mov_b32 s4, 1.0 +; GFX9-NEXT: s_mov_b32 s5, 2.0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 3 +; GFX9-NEXT: v_readlane_b32 s30, v40, 2 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -10902,18 +11060,22 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s34, 4 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2f32_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2f32_inreg@abs32@lo -; GFX10-NEXT: s_mov_b32 s0, 1.0 -; GFX10-NEXT: s_mov_b32 s1, 2.0 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 1.0 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2.0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 2 +; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 3 +; GFX10-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -10931,19 +11093,23 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_mov_b32 s3, external_void_func_v2f32_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s2, external_void_func_v2f32_inreg@abs32@lo -; GFX11-NEXT: s_mov_b32 s0, 1.0 -; GFX11-NEXT: s_mov_b32 s1, 2.0 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 4 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2f32_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2f32_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 1.0 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 2.0 +; GFX11-NEXT: v_writelane_b32 v40, s30, 2 +; GFX11-NEXT: v_writelane_b32 v40, s31, 3 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 3 +; GFX11-NEXT: v_readlane_b32 s30, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -10961,24 +11127,28 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v2f32_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v2f32_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 1.0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 2.0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2f32_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2f32_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload -; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @external_void_func_v2f32_inreg(<2 x float> inreg ) @@ -10994,19 +11164,25 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 5 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v40, s6, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 3 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3f32_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3f32_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s0, 1.0 -; GFX9-NEXT: s_mov_b32 s1, 2.0 -; GFX9-NEXT: s_mov_b32 s2, 4.0 +; GFX9-NEXT: s_mov_b32 s4, 1.0 +; GFX9-NEXT: s_mov_b32 s5, 2.0 +; GFX9-NEXT: s_mov_b32 s6, 4.0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 4 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 4 +; GFX9-NEXT: v_readlane_b32 s30, v40, 3 +; GFX9-NEXT: v_readlane_b32 s6, v40, 2 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 5 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -11024,19 +11200,25 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s34, 5 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3f32_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3f32_inreg@abs32@lo -; GFX10-NEXT: s_mov_b32 s0, 1.0 -; GFX10-NEXT: s_mov_b32 s1, 2.0 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: s_mov_b32 s2, 4.0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 1.0 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2.0 +; GFX10-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-NEXT: s_mov_b32 s6, 4.0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 3 +; GFX10-NEXT: v_writelane_b32 v40, s31, 4 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 4 +; GFX10-NEXT: v_readlane_b32 s30, v40, 3 +; GFX10-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 5 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -11054,20 +11236,26 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_mov_b32 s35, external_void_func_v3f32_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s34, external_void_func_v3f32_inreg@abs32@lo -; GFX11-NEXT: s_mov_b32 s0, 1.0 -; GFX11-NEXT: s_mov_b32 s1, 2.0 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: s_mov_b32 s2, 4.0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 5 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3f32_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3f32_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 1.0 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 2.0 +; GFX11-NEXT: v_writelane_b32 v40, s6, 2 +; GFX11-NEXT: s_mov_b32 s6, 4.0 +; GFX11-NEXT: v_writelane_b32 v40, s30, 3 +; GFX11-NEXT: v_writelane_b32 v40, s31, 4 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 4 +; GFX11-NEXT: v_readlane_b32 s30, v40, 3 +; GFX11-NEXT: v_readlane_b32 s6, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 5 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -11085,19 +11273,25 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v3f32_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v3f32_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 1.0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 2.0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 4.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 5 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3f32_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3f32_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 4.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 4 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 5 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -11119,23 +11313,31 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 3 +; GFX9-NEXT: v_writelane_b32 v40, s34, 7 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 -; GFX9-NEXT: v_writelane_b32 v40, s30, 1 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v40, s6, 2 +; GFX9-NEXT: v_writelane_b32 v40, s7, 3 +; GFX9-NEXT: v_writelane_b32 v40, s8, 4 +; GFX9-NEXT: v_writelane_b32 v40, s30, 5 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v5f32_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v5f32_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s0, 1.0 -; GFX9-NEXT: s_mov_b32 s1, 2.0 -; GFX9-NEXT: s_mov_b32 s2, 4.0 -; GFX9-NEXT: s_mov_b32 s3, -1.0 -; GFX9-NEXT: s_mov_b32 s4, 0.5 +; GFX9-NEXT: s_mov_b32 s4, 1.0 +; GFX9-NEXT: s_mov_b32 s5, 2.0 +; GFX9-NEXT: s_mov_b32 s6, 4.0 +; GFX9-NEXT: s_mov_b32 s7, -1.0 +; GFX9-NEXT: s_mov_b32 s8, 0.5 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 2 +; GFX9-NEXT: v_writelane_b32 v40, s31, 6 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 2 -; GFX9-NEXT: v_readlane_b32 s30, v40, 1 +; GFX9-NEXT: v_readlane_b32 s31, v40, 6 +; GFX9-NEXT: v_readlane_b32 s30, v40, 5 +; GFX9-NEXT: v_readlane_b32 s8, v40, 4 +; GFX9-NEXT: v_readlane_b32 s7, v40, 3 +; GFX9-NEXT: v_readlane_b32 s6, v40, 2 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 3 +; GFX9-NEXT: v_readlane_b32 s34, v40, 7 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -11153,23 +11355,31 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 3 +; GFX10-NEXT: v_writelane_b32 v40, s34, 7 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v5f32_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v5f32_inreg@abs32@lo -; GFX10-NEXT: s_mov_b32 s0, 1.0 -; GFX10-NEXT: s_mov_b32 s1, 2.0 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s2, 4.0 -; GFX10-NEXT: s_mov_b32 s3, -1.0 -; GFX10-NEXT: s_mov_b32 s4, 0.5 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 -; GFX10-NEXT: v_writelane_b32 v40, s31, 2 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 1.0 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2.0 +; GFX10-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-NEXT: s_mov_b32 s6, 4.0 +; GFX10-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-NEXT: s_mov_b32 s7, -1.0 +; GFX10-NEXT: v_writelane_b32 v40, s8, 4 +; GFX10-NEXT: s_mov_b32 s8, 0.5 +; GFX10-NEXT: v_writelane_b32 v40, s30, 5 +; GFX10-NEXT: v_writelane_b32 v40, s31, 6 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 2 -; GFX10-NEXT: v_readlane_b32 s30, v40, 1 +; GFX10-NEXT: v_readlane_b32 s31, v40, 6 +; GFX10-NEXT: v_readlane_b32 s30, v40, 5 +; GFX10-NEXT: v_readlane_b32 s8, v40, 4 +; GFX10-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 3 +; GFX10-NEXT: v_readlane_b32 s34, v40, 7 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -11187,24 +11397,32 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 3 -; GFX11-NEXT: s_mov_b32 s35, external_void_func_v5f32_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s34, external_void_func_v5f32_inreg@abs32@lo -; GFX11-NEXT: s_mov_b32 s0, 1.0 -; GFX11-NEXT: s_mov_b32 s1, 2.0 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s2, 4.0 -; GFX11-NEXT: s_mov_b32 s3, -1.0 -; GFX11-NEXT: s_mov_b32 s4, 0.5 +; GFX11-NEXT: v_writelane_b32 v40, s0, 7 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v5f32_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v5f32_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 1 -; GFX11-NEXT: v_writelane_b32 v40, s31, 2 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 1.0 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 2.0 +; GFX11-NEXT: v_writelane_b32 v40, s6, 2 +; GFX11-NEXT: s_mov_b32 s6, 4.0 +; GFX11-NEXT: v_writelane_b32 v40, s7, 3 +; GFX11-NEXT: s_mov_b32 s7, -1.0 +; GFX11-NEXT: v_writelane_b32 v40, s8, 4 +; GFX11-NEXT: s_mov_b32 s8, 0.5 +; GFX11-NEXT: v_writelane_b32 v40, s30, 5 +; GFX11-NEXT: v_writelane_b32 v40, s31, 6 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 2 -; GFX11-NEXT: v_readlane_b32 s30, v40, 1 +; GFX11-NEXT: v_readlane_b32 s31, v40, 6 +; GFX11-NEXT: v_readlane_b32 s30, v40, 5 +; GFX11-NEXT: v_readlane_b32 s8, v40, 4 +; GFX11-NEXT: v_readlane_b32 s7, v40, 3 +; GFX11-NEXT: v_readlane_b32 s6, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 3 +; GFX11-NEXT: v_readlane_b32 s0, v40, 7 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -11222,23 +11440,31 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3 -; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v5f32_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v5f32_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 1.0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 2.0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 4.0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, -1.0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0.5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 7 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v5f32_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v5f32_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 4.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-SCRATCH-NEXT: s_mov_b32 s7, -1.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s8, 4 +; GFX10-SCRATCH-NEXT: s_mov_b32 s8, 0.5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 6 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 6 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s8, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 7 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -11260,18 +11486,22 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 4 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_f64_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_f64_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s0, 0 -; GFX9-NEXT: s_mov_b32 s1, 0x40100000 +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s5, 0x40100000 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 3 +; GFX9-NEXT: v_readlane_b32 s30, v40, 2 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -11289,18 +11519,22 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s34, 4 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_f64_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_f64_inreg@abs32@lo -; GFX10-NEXT: s_mov_b32 s0, 0 -; GFX10-NEXT: s_mov_b32 s1, 0x40100000 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 0x40100000 +; GFX10-NEXT: v_writelane_b32 v40, s30, 2 +; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 3 +; GFX10-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -11318,19 +11552,23 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_mov_b32 s3, external_void_func_f64_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s2, external_void_func_f64_inreg@abs32@lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_mov_b32 s1, 0x40100000 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 4 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_f64_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_f64_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 0x40100000 +; GFX11-NEXT: v_writelane_b32 v40, s30, 2 +; GFX11-NEXT: v_writelane_b32 v40, s31, 3 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 3 +; GFX11-NEXT: v_readlane_b32 s30, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -11348,18 +11586,22 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_f64_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_f64_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 0x40100000 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_f64_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_f64_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0x40100000 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -11381,20 +11623,28 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 6 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v40, s6, 2 +; GFX9-NEXT: v_writelane_b32 v40, s7, 3 +; GFX9-NEXT: v_writelane_b32 v40, s30, 4 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2f64_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2f64_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s2, 0 -; GFX9-NEXT: s_mov_b32 s0, 0 -; GFX9-NEXT: s_mov_b32 s1, 2.0 -; GFX9-NEXT: s_mov_b32 s3, 0x40100000 +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_mov_b32 s5, 2.0 +; GFX9-NEXT: s_mov_b32 s6, 0 +; GFX9-NEXT: s_mov_b32 s7, 0x40100000 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 5 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 5 +; GFX9-NEXT: v_readlane_b32 s30, v40, 4 +; GFX9-NEXT: v_readlane_b32 s7, v40, 3 +; GFX9-NEXT: v_readlane_b32 s6, v40, 2 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 6 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -11412,20 +11662,28 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s34, 6 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2f64_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2f64_inreg@abs32@lo -; GFX10-NEXT: s_mov_b32 s2, 0 -; GFX10-NEXT: s_mov_b32 s0, 0 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: s_mov_b32 s1, 2.0 -; GFX10-NEXT: s_mov_b32 s3, 0x40100000 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2.0 +; GFX10-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-NEXT: s_mov_b32 s6, 0 +; GFX10-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-NEXT: s_mov_b32 s7, 0x40100000 +; GFX10-NEXT: v_writelane_b32 v40, s30, 4 +; GFX10-NEXT: v_writelane_b32 v40, s31, 5 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 5 +; GFX10-NEXT: v_readlane_b32 s30, v40, 4 +; GFX10-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 6 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -11443,21 +11701,29 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_mov_b32 s35, external_void_func_v2f64_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s34, external_void_func_v2f64_inreg@abs32@lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_mov_b32 s1, 2.0 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_mov_b32 s3, 0x40100000 +; GFX11-NEXT: v_writelane_b32 v40, s0, 6 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2f64_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2f64_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 2.0 +; GFX11-NEXT: v_writelane_b32 v40, s6, 2 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: v_writelane_b32 v40, s7, 3 +; GFX11-NEXT: s_mov_b32 s7, 0x40100000 +; GFX11-NEXT: v_writelane_b32 v40, s30, 4 +; GFX11-NEXT: v_writelane_b32 v40, s31, 5 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 5 +; GFX11-NEXT: v_readlane_b32 s30, v40, 4 +; GFX11-NEXT: v_readlane_b32 s7, v40, 3 +; GFX11-NEXT: v_readlane_b32 s6, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 6 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -11475,20 +11741,28 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v2f64_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v2f64_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 2.0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, 0x40100000 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 6 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2f64_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2f64_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 0x40100000 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -11510,26 +11784,34 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 4 +; GFX9-NEXT: v_writelane_b32 v40, s34, 8 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: v_writelane_b32 v40, s30, 2 +; GFX9-NEXT: v_writelane_b32 v40, s6, 2 +; GFX9-NEXT: v_writelane_b32 v40, s7, 3 +; GFX9-NEXT: v_writelane_b32 v40, s8, 4 +; GFX9-NEXT: v_writelane_b32 v40, s9, 5 +; GFX9-NEXT: v_writelane_b32 v40, s30, 6 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3f64_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3f64_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s0, 0 -; GFX9-NEXT: s_mov_b32 s1, 2.0 -; GFX9-NEXT: s_mov_b32 s2, 0 -; GFX9-NEXT: s_mov_b32 s3, 0x40100000 ; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: s_mov_b32 s5, 0x40200000 +; GFX9-NEXT: s_mov_b32 s5, 2.0 +; GFX9-NEXT: s_mov_b32 s6, 0 +; GFX9-NEXT: s_mov_b32 s7, 0x40100000 +; GFX9-NEXT: s_mov_b32 s8, 0 +; GFX9-NEXT: s_mov_b32 s9, 0x40200000 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 3 +; GFX9-NEXT: v_writelane_b32 v40, s31, 7 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 3 -; GFX9-NEXT: v_readlane_b32 s30, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 7 +; GFX9-NEXT: v_readlane_b32 s30, v40, 6 +; GFX9-NEXT: v_readlane_b32 s9, v40, 5 +; GFX9-NEXT: v_readlane_b32 s8, v40, 4 +; GFX9-NEXT: v_readlane_b32 s7, v40, 3 +; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 4 +; GFX9-NEXT: v_readlane_b32 s34, v40, 8 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -11547,26 +11829,34 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 4 +; GFX10-NEXT: v_writelane_b32 v40, s34, 8 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3f64_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3f64_inreg@abs32@lo -; GFX10-NEXT: s_mov_b32 s0, 0 -; GFX10-NEXT: s_mov_b32 s1, 2.0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s2, 0 -; GFX10-NEXT: s_mov_b32 s3, 0x40100000 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 0x40200000 -; GFX10-NEXT: v_writelane_b32 v40, s30, 2 -; GFX10-NEXT: v_writelane_b32 v40, s31, 3 +; GFX10-NEXT: s_mov_b32 s5, 2.0 +; GFX10-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-NEXT: s_mov_b32 s6, 0 +; GFX10-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-NEXT: s_mov_b32 s7, 0x40100000 +; GFX10-NEXT: v_writelane_b32 v40, s8, 4 +; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: v_writelane_b32 v40, s9, 5 +; GFX10-NEXT: s_mov_b32 s9, 0x40200000 +; GFX10-NEXT: v_writelane_b32 v40, s30, 6 +; GFX10-NEXT: v_writelane_b32 v40, s31, 7 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 3 -; GFX10-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 7 +; GFX10-NEXT: v_readlane_b32 s30, v40, 6 +; GFX10-NEXT: v_readlane_b32 s9, v40, 5 +; GFX10-NEXT: v_readlane_b32 s8, v40, 4 +; GFX10-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 4 +; GFX10-NEXT: v_readlane_b32 s34, v40, 8 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -11584,27 +11874,35 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 4 -; GFX11-NEXT: s_mov_b32 s35, external_void_func_v3f64_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s34, external_void_func_v3f64_inreg@abs32@lo -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_mov_b32 s1, 2.0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 8 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3f64_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3f64_inreg@abs32@lo +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: s_mov_b32 s3, 0x40100000 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 0x40200000 -; GFX11-NEXT: v_writelane_b32 v40, s30, 2 -; GFX11-NEXT: v_writelane_b32 v40, s31, 3 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX11-NEXT: s_mov_b32 s5, 2.0 +; GFX11-NEXT: v_writelane_b32 v40, s6, 2 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: v_writelane_b32 v40, s7, 3 +; GFX11-NEXT: s_mov_b32 s7, 0x40100000 +; GFX11-NEXT: v_writelane_b32 v40, s8, 4 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: v_writelane_b32 v40, s9, 5 +; GFX11-NEXT: s_mov_b32 s9, 0x40200000 +; GFX11-NEXT: v_writelane_b32 v40, s30, 6 +; GFX11-NEXT: v_writelane_b32 v40, s31, 7 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 3 -; GFX11-NEXT: v_readlane_b32 s30, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 7 +; GFX11-NEXT: v_readlane_b32 s30, v40, 6 +; GFX11-NEXT: v_readlane_b32 s9, v40, 5 +; GFX11-NEXT: v_readlane_b32 s8, v40, 4 +; GFX11-NEXT: v_readlane_b32 s7, v40, 3 +; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 4 +; GFX11-NEXT: v_readlane_b32 s0, v40, 8 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -11622,26 +11920,34 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 -; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v3f64_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v3f64_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 2.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 8 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3f64_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3f64_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, 0x40100000 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0x40200000 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 0x40100000 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s8, 4 +; GFX10-SCRATCH-NEXT: s_mov_b32 s8, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s9, 5 +; GFX10-SCRATCH-NEXT: s_mov_b32 s9, 0x40200000 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 6 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 7 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 7 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 6 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s9, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s8, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 8 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -11663,17 +11969,19 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_load_dword s0, s[34:35], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 3 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: s_load_dword s4, s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2i16_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2i16_inreg@abs32@lo ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 2 +; GFX9-NEXT: v_readlane_b32 s30, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 3 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -11691,17 +11999,19 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_load_dword s0, s[34:35], 0x0 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s34, 3 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_load_dword s4, s[34:35], 0x0 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2i16_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2i16_inreg@abs32@lo -; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 2 +; GFX10-NEXT: v_readlane_b32 s30, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 3 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -11719,18 +12029,20 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX11-NEXT: s_mov_b32 s3, external_void_func_v2i16_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s2, external_void_func_v2i16_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s0, 3 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2i16_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2i16_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 +; GFX11-NEXT: v_writelane_b32 v40, s31, 2 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 2 +; GFX11-NEXT: v_readlane_b32 s30, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 3 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -11748,17 +12060,19 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v2i16_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v2i16_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2i16_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2i16_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -11781,20 +12095,21 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s37, external_void_func_v3i16_inreg@abs32@hi -; GFX9-NEXT: s_mov_b32 s36, external_void_func_v3i16_inreg@abs32@lo +; GFX9-NEXT: v_writelane_b32 v40, s34, 4 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 2 +; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3i16_inreg@abs32@hi +; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3i16_inreg@abs32@lo ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s1, s35 -; GFX9-NEXT: s_mov_b32 s0, s34 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_writelane_b32 v40, s31, 3 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX9-NEXT: v_readlane_b32 s31, v40, 3 +; GFX9-NEXT: v_readlane_b32 s30, v40, 2 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -11812,20 +12127,21 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 -; GFX10-NEXT: s_mov_b32 s37, external_void_func_v3i16_inreg@abs32@hi -; GFX10-NEXT: s_mov_b32 s36, external_void_func_v3i16_inreg@abs32@lo +; GFX10-NEXT: v_writelane_b32 v40, s34, 4 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s1, s35 -; GFX10-NEXT: s_mov_b32 s0, s34 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 +; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3i16_inreg@abs32@hi +; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3i16_inreg@abs32@lo +; GFX10-NEXT: v_writelane_b32 v40, s30, 2 +; GFX10-NEXT: v_writelane_b32 v40, s31, 3 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX10-NEXT: v_readlane_b32 s31, v40, 3 +; GFX10-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -11843,18 +12159,22 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_mov_b32 s3, external_void_func_v3i16_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s2, external_void_func_v3i16_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s0, 4 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3i16_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3i16_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s30, 2 +; GFX11-NEXT: v_writelane_b32 v40, s31, 3 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 3 +; GFX11-NEXT: v_readlane_b32 s30, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -11872,17 +12192,21 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v3i16_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v3i16_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3i16_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3i16_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -11905,20 +12229,21 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s37, external_void_func_v3f16_inreg@abs32@hi -; GFX9-NEXT: s_mov_b32 s36, external_void_func_v3f16_inreg@abs32@lo +; GFX9-NEXT: v_writelane_b32 v40, s34, 4 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 2 +; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3f16_inreg@abs32@hi +; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3f16_inreg@abs32@lo ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s1, s35 -; GFX9-NEXT: s_mov_b32 s0, s34 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_writelane_b32 v40, s31, 3 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX9-NEXT: v_readlane_b32 s31, v40, 3 +; GFX9-NEXT: v_readlane_b32 s30, v40, 2 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -11936,20 +12261,21 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 -; GFX10-NEXT: s_mov_b32 s37, external_void_func_v3f16_inreg@abs32@hi -; GFX10-NEXT: s_mov_b32 s36, external_void_func_v3f16_inreg@abs32@lo +; GFX10-NEXT: v_writelane_b32 v40, s34, 4 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s1, s35 -; GFX10-NEXT: s_mov_b32 s0, s34 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 +; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3f16_inreg@abs32@hi +; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3f16_inreg@abs32@lo +; GFX10-NEXT: v_writelane_b32 v40, s30, 2 +; GFX10-NEXT: v_writelane_b32 v40, s31, 3 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX10-NEXT: v_readlane_b32 s31, v40, 3 +; GFX10-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -11967,18 +12293,22 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_mov_b32 s3, external_void_func_v3f16_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s2, external_void_func_v3f16_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s0, 4 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3f16_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3f16_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s30, 2 +; GFX11-NEXT: v_writelane_b32 v40, s31, 3 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 3 +; GFX11-NEXT: v_readlane_b32 s30, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -11996,17 +12326,21 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v3f16_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v3f16_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3f16_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3f16_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -12029,18 +12363,22 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 4 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3i16_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3i16_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s0, 0x20001 -; GFX9-NEXT: s_mov_b32 s1, 3 +; GFX9-NEXT: s_mov_b32 s4, 0x20001 +; GFX9-NEXT: s_mov_b32 s5, 3 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 3 +; GFX9-NEXT: v_readlane_b32 s30, v40, 2 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -12058,18 +12396,22 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s34, 4 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3i16_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3i16_inreg@abs32@lo -; GFX10-NEXT: s_mov_b32 s0, 0x20001 -; GFX10-NEXT: s_mov_b32 s1, 3 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 0x20001 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 3 +; GFX10-NEXT: v_writelane_b32 v40, s30, 2 +; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 3 +; GFX10-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -12087,19 +12429,23 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_mov_b32 s3, external_void_func_v3i16_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s2, external_void_func_v3i16_inreg@abs32@lo -; GFX11-NEXT: s_mov_b32 s0, 0x20001 -; GFX11-NEXT: s_mov_b32 s1, 3 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 4 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3i16_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3i16_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 0x20001 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 3 +; GFX11-NEXT: v_writelane_b32 v40, s30, 2 +; GFX11-NEXT: v_writelane_b32 v40, s31, 3 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 3 +; GFX11-NEXT: v_readlane_b32 s30, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -12117,18 +12463,22 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v3i16_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v3i16_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 0x20001 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 3 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3i16_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3i16_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x20001 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -12150,18 +12500,22 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 4 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3f16_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3f16_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s0, 0x40003c00 -; GFX9-NEXT: s_movk_i32 s1, 0x4400 +; GFX9-NEXT: s_mov_b32 s4, 0x40003c00 +; GFX9-NEXT: s_movk_i32 s5, 0x4400 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 3 +; GFX9-NEXT: v_readlane_b32 s30, v40, 2 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -12179,18 +12533,22 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s34, 4 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3f16_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3f16_inreg@abs32@lo -; GFX10-NEXT: s_mov_b32 s0, 0x40003c00 -; GFX10-NEXT: s_movk_i32 s1, 0x4400 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 0x40003c00 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_movk_i32 s5, 0x4400 +; GFX10-NEXT: v_writelane_b32 v40, s30, 2 +; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 3 +; GFX10-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -12208,19 +12566,23 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_mov_b32 s3, external_void_func_v3f16_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s2, external_void_func_v3f16_inreg@abs32@lo -; GFX11-NEXT: s_mov_b32 s0, 0x40003c00 -; GFX11-NEXT: s_movk_i32 s1, 0x4400 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 4 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3f16_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3f16_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 0x40003c00 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_movk_i32 s5, 0x4400 +; GFX11-NEXT: v_writelane_b32 v40, s30, 2 +; GFX11-NEXT: v_writelane_b32 v40, s31, 3 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 3 +; GFX11-NEXT: v_readlane_b32 s30, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -12238,18 +12600,22 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v3f16_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v3f16_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 0x40003c00 -; GFX10-SCRATCH-NEXT: s_movk_i32 s1, 0x4400 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3f16_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3f16_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x40003c00 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_movk_i32 s5, 0x4400 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -12271,20 +12637,21 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s37, external_void_func_v4i16_inreg@abs32@hi -; GFX9-NEXT: s_mov_b32 s36, external_void_func_v4i16_inreg@abs32@lo +; GFX9-NEXT: v_writelane_b32 v40, s34, 4 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 2 +; GFX9-NEXT: s_mov_b32 s35, external_void_func_v4i16_inreg@abs32@hi +; GFX9-NEXT: s_mov_b32 s34, external_void_func_v4i16_inreg@abs32@lo ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s34 -; GFX9-NEXT: s_mov_b32 s1, s35 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_writelane_b32 v40, s31, 3 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX9-NEXT: v_readlane_b32 s31, v40, 3 +; GFX9-NEXT: v_readlane_b32 s30, v40, 2 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -12302,20 +12669,21 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 -; GFX10-NEXT: s_mov_b32 s37, external_void_func_v4i16_inreg@abs32@hi -; GFX10-NEXT: s_mov_b32 s36, external_void_func_v4i16_inreg@abs32@lo +; GFX10-NEXT: v_writelane_b32 v40, s34, 4 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s0, s34 -; GFX10-NEXT: s_mov_b32 s1, s35 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 +; GFX10-NEXT: s_mov_b32 s35, external_void_func_v4i16_inreg@abs32@hi +; GFX10-NEXT: s_mov_b32 s34, external_void_func_v4i16_inreg@abs32@lo +; GFX10-NEXT: v_writelane_b32 v40, s30, 2 +; GFX10-NEXT: v_writelane_b32 v40, s31, 3 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX10-NEXT: v_readlane_b32 s31, v40, 3 +; GFX10-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -12333,18 +12701,22 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_mov_b32 s3, external_void_func_v4i16_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s2, external_void_func_v4i16_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s0, 4 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v4i16_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v4i16_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s30, 2 +; GFX11-NEXT: v_writelane_b32 v40, s31, 3 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 3 +; GFX11-NEXT: v_readlane_b32 s30, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -12362,19 +12734,23 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v4i16_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v4i16_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v4i16_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v4i16_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -12395,18 +12771,22 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 4 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v4i16_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v4i16_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s0, 0x20001 -; GFX9-NEXT: s_mov_b32 s1, 0x40003 +; GFX9-NEXT: s_mov_b32 s4, 0x20001 +; GFX9-NEXT: s_mov_b32 s5, 0x40003 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 3 +; GFX9-NEXT: v_readlane_b32 s30, v40, 2 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -12424,18 +12804,22 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s34, 4 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v4i16_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v4i16_inreg@abs32@lo -; GFX10-NEXT: s_mov_b32 s0, 0x20001 -; GFX10-NEXT: s_mov_b32 s1, 0x40003 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 0x20001 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 0x40003 +; GFX10-NEXT: v_writelane_b32 v40, s30, 2 +; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 3 +; GFX10-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -12453,19 +12837,23 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_mov_b32 s3, external_void_func_v4i16_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s2, external_void_func_v4i16_inreg@abs32@lo -; GFX11-NEXT: s_mov_b32 s0, 0x20001 -; GFX11-NEXT: s_mov_b32 s1, 0x40003 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 4 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v4i16_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v4i16_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 0x20001 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 0x40003 +; GFX11-NEXT: v_writelane_b32 v40, s30, 2 +; GFX11-NEXT: v_writelane_b32 v40, s31, 3 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 3 +; GFX11-NEXT: v_readlane_b32 s30, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -12483,18 +12871,22 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v4i16_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v4i16_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 0x20001 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 0x40003 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v4i16_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v4i16_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x20001 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0x40003 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -12516,17 +12908,19 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_load_dword s0, s[34:35], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 3 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: s_load_dword s4, s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2f16_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2f16_inreg@abs32@lo ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 2 +; GFX9-NEXT: v_readlane_b32 s30, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 3 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -12544,17 +12938,19 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_load_dword s0, s[34:35], 0x0 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s34, 3 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_load_dword s4, s[34:35], 0x0 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2f16_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2f16_inreg@abs32@lo -; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 2 +; GFX10-NEXT: v_readlane_b32 s30, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 3 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -12572,18 +12968,20 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 -; GFX11-NEXT: s_mov_b32 s3, external_void_func_v2f16_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s2, external_void_func_v2f16_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s0, 3 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2f16_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2f16_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 +; GFX11-NEXT: v_writelane_b32 v40, s31, 2 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 2 +; GFX11-NEXT: v_readlane_b32 s30, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 3 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -12601,17 +12999,19 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v2f16_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v2f16_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2f16_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2f16_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -12634,20 +13034,21 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s37, external_void_func_v2i32_inreg@abs32@hi -; GFX9-NEXT: s_mov_b32 s36, external_void_func_v2i32_inreg@abs32@lo +; GFX9-NEXT: v_writelane_b32 v40, s34, 4 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 2 +; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2i32_inreg@abs32@hi +; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2i32_inreg@abs32@lo ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s34 -; GFX9-NEXT: s_mov_b32 s1, s35 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_writelane_b32 v40, s31, 3 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX9-NEXT: v_readlane_b32 s31, v40, 3 +; GFX9-NEXT: v_readlane_b32 s30, v40, 2 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -12665,20 +13066,21 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 -; GFX10-NEXT: s_mov_b32 s37, external_void_func_v2i32_inreg@abs32@hi -; GFX10-NEXT: s_mov_b32 s36, external_void_func_v2i32_inreg@abs32@lo +; GFX10-NEXT: v_writelane_b32 v40, s34, 4 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s0, s34 -; GFX10-NEXT: s_mov_b32 s1, s35 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 +; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2i32_inreg@abs32@hi +; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2i32_inreg@abs32@lo +; GFX10-NEXT: v_writelane_b32 v40, s30, 2 +; GFX10-NEXT: v_writelane_b32 v40, s31, 3 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX10-NEXT: v_readlane_b32 s31, v40, 3 +; GFX10-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -12696,18 +13098,22 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_mov_b32 s3, external_void_func_v2i32_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s2, external_void_func_v2i32_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s0, 4 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2i32_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2i32_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s30, 2 +; GFX11-NEXT: v_writelane_b32 v40, s31, 3 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 3 +; GFX11-NEXT: v_readlane_b32 s30, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -12725,17 +13131,21 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v2i32_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v2i32_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2i32_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2i32_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -12758,18 +13168,22 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 4 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2i32_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2i32_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s0, 1 -; GFX9-NEXT: s_mov_b32 s1, 2 +; GFX9-NEXT: s_mov_b32 s4, 1 +; GFX9-NEXT: s_mov_b32 s5, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 3 +; GFX9-NEXT: v_readlane_b32 s30, v40, 2 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -12787,18 +13201,22 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s34, 4 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2i32_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2i32_inreg@abs32@lo -; GFX10-NEXT: s_mov_b32 s0, 1 -; GFX10-NEXT: s_mov_b32 s1, 2 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 1 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 2 +; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 3 +; GFX10-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -12816,19 +13234,23 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_mov_b32 s3, external_void_func_v2i32_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s2, external_void_func_v2i32_inreg@abs32@lo -; GFX11-NEXT: s_mov_b32 s0, 1 -; GFX11-NEXT: s_mov_b32 s1, 2 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 4 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2i32_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2i32_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 1 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 2 +; GFX11-NEXT: v_writelane_b32 v40, s31, 3 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 3 +; GFX11-NEXT: v_readlane_b32 s30, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -12846,18 +13268,22 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v2i32_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v2i32_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 2 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2i32_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2i32_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -12879,19 +13305,25 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 5 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v40, s6, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 3 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3i32_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3i32_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s0, 3 -; GFX9-NEXT: s_mov_b32 s1, 4 -; GFX9-NEXT: s_mov_b32 s2, 5 +; GFX9-NEXT: s_mov_b32 s4, 3 +; GFX9-NEXT: s_mov_b32 s5, 4 +; GFX9-NEXT: s_mov_b32 s6, 5 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 4 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 4 +; GFX9-NEXT: v_readlane_b32 s30, v40, 3 +; GFX9-NEXT: v_readlane_b32 s6, v40, 2 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 5 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -12909,19 +13341,25 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s34, 5 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3i32_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3i32_inreg@abs32@lo -; GFX10-NEXT: s_mov_b32 s0, 3 -; GFX10-NEXT: s_mov_b32 s1, 4 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: s_mov_b32 s2, 5 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 3 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 4 +; GFX10-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-NEXT: s_mov_b32 s6, 5 +; GFX10-NEXT: v_writelane_b32 v40, s30, 3 +; GFX10-NEXT: v_writelane_b32 v40, s31, 4 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 4 +; GFX10-NEXT: v_readlane_b32 s30, v40, 3 +; GFX10-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 5 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -12939,20 +13377,26 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_mov_b32 s35, external_void_func_v3i32_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s34, external_void_func_v3i32_inreg@abs32@lo -; GFX11-NEXT: s_mov_b32 s0, 3 -; GFX11-NEXT: s_mov_b32 s1, 4 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: s_mov_b32 s2, 5 +; GFX11-NEXT: v_writelane_b32 v40, s0, 5 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3i32_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3i32_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 3 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 4 +; GFX11-NEXT: v_writelane_b32 v40, s6, 2 +; GFX11-NEXT: s_mov_b32 s6, 5 +; GFX11-NEXT: v_writelane_b32 v40, s30, 3 +; GFX11-NEXT: v_writelane_b32 v40, s31, 4 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 4 +; GFX11-NEXT: v_readlane_b32 s30, v40, 3 +; GFX11-NEXT: v_readlane_b32 s6, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 5 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -12970,19 +13414,25 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v3i32_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v3i32_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 3 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 5 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3i32_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3i32_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 4 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 5 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -13004,20 +13454,28 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 6 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v40, s6, 2 +; GFX9-NEXT: v_writelane_b32 v40, s7, 3 +; GFX9-NEXT: v_writelane_b32 v40, s30, 4 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3i32_i32_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3i32_i32_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s0, 3 -; GFX9-NEXT: s_mov_b32 s1, 4 -; GFX9-NEXT: s_mov_b32 s2, 5 -; GFX9-NEXT: s_mov_b32 s3, 6 +; GFX9-NEXT: s_mov_b32 s4, 3 +; GFX9-NEXT: s_mov_b32 s5, 4 +; GFX9-NEXT: s_mov_b32 s6, 5 +; GFX9-NEXT: s_mov_b32 s7, 6 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 5 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 5 +; GFX9-NEXT: v_readlane_b32 s30, v40, 4 +; GFX9-NEXT: v_readlane_b32 s7, v40, 3 +; GFX9-NEXT: v_readlane_b32 s6, v40, 2 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 6 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -13035,20 +13493,28 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s34, 6 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3i32_i32_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3i32_i32_inreg@abs32@lo -; GFX10-NEXT: s_mov_b32 s0, 3 -; GFX10-NEXT: s_mov_b32 s1, 4 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: s_mov_b32 s2, 5 -; GFX10-NEXT: s_mov_b32 s3, 6 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 3 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 4 +; GFX10-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-NEXT: s_mov_b32 s6, 5 +; GFX10-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-NEXT: s_mov_b32 s7, 6 +; GFX10-NEXT: v_writelane_b32 v40, s30, 4 +; GFX10-NEXT: v_writelane_b32 v40, s31, 5 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 5 +; GFX10-NEXT: v_readlane_b32 s30, v40, 4 +; GFX10-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 6 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -13066,21 +13532,29 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_mov_b32 s35, external_void_func_v3i32_i32_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s34, external_void_func_v3i32_i32_inreg@abs32@lo -; GFX11-NEXT: s_mov_b32 s0, 3 -; GFX11-NEXT: s_mov_b32 s1, 4 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: s_mov_b32 s2, 5 -; GFX11-NEXT: s_mov_b32 s3, 6 +; GFX11-NEXT: v_writelane_b32 v40, s0, 6 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3i32_i32_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3i32_i32_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 3 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 4 +; GFX11-NEXT: v_writelane_b32 v40, s6, 2 +; GFX11-NEXT: s_mov_b32 s6, 5 +; GFX11-NEXT: v_writelane_b32 v40, s7, 3 +; GFX11-NEXT: s_mov_b32 s7, 6 +; GFX11-NEXT: v_writelane_b32 v40, s30, 4 +; GFX11-NEXT: v_writelane_b32 v40, s31, 5 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 5 +; GFX11-NEXT: v_readlane_b32 s30, v40, 4 +; GFX11-NEXT: v_readlane_b32 s7, v40, 3 +; GFX11-NEXT: v_readlane_b32 s6, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 6 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -13098,20 +13572,28 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v3i32_i32_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v3i32_i32_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 3 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 5 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, 6 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 6 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3i32_i32_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3i32_i32_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 6 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -13133,22 +13615,25 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_load_dwordx4 s[36:39], s[34:35], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 6 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v40, s6, 2 +; GFX9-NEXT: v_writelane_b32 v40, s7, 3 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 4 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v4i32_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v4i32_inreg@abs32@lo -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s36 -; GFX9-NEXT: s_mov_b32 s1, s37 -; GFX9-NEXT: s_mov_b32 s2, s38 -; GFX9-NEXT: s_mov_b32 s3, s39 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 5 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 5 +; GFX9-NEXT: v_readlane_b32 s30, v40, 4 +; GFX9-NEXT: v_readlane_b32 s7, v40, 3 +; GFX9-NEXT: v_readlane_b32 s6, v40, 2 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 6 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -13166,22 +13651,25 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_load_dwordx4 s[36:39], s[34:35], 0x0 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s34, 6 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v4i32_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v4i32_inreg@abs32@lo -; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s0, s36 -; GFX10-NEXT: s_mov_b32 s1, s37 -; GFX10-NEXT: s_mov_b32 s2, s38 -; GFX10-NEXT: s_mov_b32 s3, s39 +; GFX10-NEXT: v_writelane_b32 v40, s30, 4 +; GFX10-NEXT: v_writelane_b32 v40, s31, 5 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 5 +; GFX10-NEXT: v_readlane_b32 s30, v40, 4 +; GFX10-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 6 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -13199,18 +13687,26 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-NEXT: s_mov_b32 s35, external_void_func_v4i32_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s34, external_void_func_v4i32_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s0, 6 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: v_writelane_b32 v40, s6, 2 +; GFX11-NEXT: v_writelane_b32 v40, s7, 3 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v4i32_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v4i32_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s30, 4 +; GFX11-NEXT: v_writelane_b32 v40, s31, 5 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 5 +; GFX11-NEXT: v_readlane_b32 s30, v40, 4 +; GFX11-NEXT: v_readlane_b32 s7, v40, 3 +; GFX11-NEXT: v_readlane_b32 s6, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 6 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -13228,17 +13724,25 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v4i32_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v4i32_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 6 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-SCRATCH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v4i32_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v4i32_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -13261,20 +13765,28 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 6 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v40, s6, 2 +; GFX9-NEXT: v_writelane_b32 v40, s7, 3 +; GFX9-NEXT: v_writelane_b32 v40, s30, 4 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v4i32_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v4i32_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s0, 1 -; GFX9-NEXT: s_mov_b32 s1, 2 -; GFX9-NEXT: s_mov_b32 s2, 3 -; GFX9-NEXT: s_mov_b32 s3, 4 +; GFX9-NEXT: s_mov_b32 s4, 1 +; GFX9-NEXT: s_mov_b32 s5, 2 +; GFX9-NEXT: s_mov_b32 s6, 3 +; GFX9-NEXT: s_mov_b32 s7, 4 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v40, s31, 5 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 5 +; GFX9-NEXT: v_readlane_b32 s30, v40, 4 +; GFX9-NEXT: v_readlane_b32 s7, v40, 3 +; GFX9-NEXT: v_readlane_b32 s6, v40, 2 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 6 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -13292,20 +13804,28 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s34, 6 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v4i32_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v4i32_inreg@abs32@lo -; GFX10-NEXT: s_mov_b32 s0, 1 -; GFX10-NEXT: s_mov_b32 s1, 2 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: s_mov_b32 s2, 3 -; GFX10-NEXT: s_mov_b32 s3, 4 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 1 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2 +; GFX10-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-NEXT: s_mov_b32 s6, 3 +; GFX10-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-NEXT: s_mov_b32 s7, 4 +; GFX10-NEXT: v_writelane_b32 v40, s30, 4 +; GFX10-NEXT: v_writelane_b32 v40, s31, 5 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 5 +; GFX10-NEXT: v_readlane_b32 s30, v40, 4 +; GFX10-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 6 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -13323,21 +13843,29 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_mov_b32 s35, external_void_func_v4i32_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s34, external_void_func_v4i32_inreg@abs32@lo -; GFX11-NEXT: s_mov_b32 s0, 1 -; GFX11-NEXT: s_mov_b32 s1, 2 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: s_mov_b32 s2, 3 -; GFX11-NEXT: s_mov_b32 s3, 4 +; GFX11-NEXT: v_writelane_b32 v40, s0, 6 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v4i32_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v4i32_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 1 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 2 +; GFX11-NEXT: v_writelane_b32 v40, s6, 2 +; GFX11-NEXT: s_mov_b32 s6, 3 +; GFX11-NEXT: v_writelane_b32 v40, s7, 3 +; GFX11-NEXT: s_mov_b32 s7, 4 +; GFX11-NEXT: v_writelane_b32 v40, s30, 4 +; GFX11-NEXT: v_writelane_b32 v40, s31, 5 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 5 +; GFX11-NEXT: v_readlane_b32 s30, v40, 4 +; GFX11-NEXT: v_readlane_b32 s7, v40, 3 +; GFX11-NEXT: v_readlane_b32 s6, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 6 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -13355,20 +13883,28 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v4i32_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v4i32_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 2 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 3 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 6 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v4i32_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v4i32_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -13390,23 +13926,31 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 3 +; GFX9-NEXT: v_writelane_b32 v40, s34, 7 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 -; GFX9-NEXT: v_writelane_b32 v40, s30, 1 +; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v40, s6, 2 +; GFX9-NEXT: v_writelane_b32 v40, s7, 3 +; GFX9-NEXT: v_writelane_b32 v40, s8, 4 +; GFX9-NEXT: v_writelane_b32 v40, s30, 5 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v5i32_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v5i32_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s0, 1 -; GFX9-NEXT: s_mov_b32 s1, 2 -; GFX9-NEXT: s_mov_b32 s2, 3 -; GFX9-NEXT: s_mov_b32 s3, 4 -; GFX9-NEXT: s_mov_b32 s4, 5 +; GFX9-NEXT: s_mov_b32 s4, 1 +; GFX9-NEXT: s_mov_b32 s5, 2 +; GFX9-NEXT: s_mov_b32 s6, 3 +; GFX9-NEXT: s_mov_b32 s7, 4 +; GFX9-NEXT: s_mov_b32 s8, 5 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 2 +; GFX9-NEXT: v_writelane_b32 v40, s31, 6 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 2 -; GFX9-NEXT: v_readlane_b32 s30, v40, 1 +; GFX9-NEXT: v_readlane_b32 s31, v40, 6 +; GFX9-NEXT: v_readlane_b32 s30, v40, 5 +; GFX9-NEXT: v_readlane_b32 s8, v40, 4 +; GFX9-NEXT: v_readlane_b32 s7, v40, 3 +; GFX9-NEXT: v_readlane_b32 s6, v40, 2 +; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 3 +; GFX9-NEXT: v_readlane_b32 s34, v40, 7 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -13424,23 +13968,31 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 3 +; GFX10-NEXT: v_writelane_b32 v40, s34, 7 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v5i32_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v5i32_inreg@abs32@lo -; GFX10-NEXT: s_mov_b32 s0, 1 -; GFX10-NEXT: s_mov_b32 s1, 2 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s2, 3 -; GFX10-NEXT: s_mov_b32 s3, 4 -; GFX10-NEXT: s_mov_b32 s4, 5 -; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 -; GFX10-NEXT: v_writelane_b32 v40, s31, 2 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 1 +; GFX10-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-NEXT: s_mov_b32 s5, 2 +; GFX10-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-NEXT: s_mov_b32 s6, 3 +; GFX10-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-NEXT: s_mov_b32 s7, 4 +; GFX10-NEXT: v_writelane_b32 v40, s8, 4 +; GFX10-NEXT: s_mov_b32 s8, 5 +; GFX10-NEXT: v_writelane_b32 v40, s30, 5 +; GFX10-NEXT: v_writelane_b32 v40, s31, 6 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 2 -; GFX10-NEXT: v_readlane_b32 s30, v40, 1 +; GFX10-NEXT: v_readlane_b32 s31, v40, 6 +; GFX10-NEXT: v_readlane_b32 s30, v40, 5 +; GFX10-NEXT: v_readlane_b32 s8, v40, 4 +; GFX10-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 3 +; GFX10-NEXT: v_readlane_b32 s34, v40, 7 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -13458,24 +14010,32 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 3 -; GFX11-NEXT: s_mov_b32 s35, external_void_func_v5i32_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s34, external_void_func_v5i32_inreg@abs32@lo -; GFX11-NEXT: s_mov_b32 s0, 1 -; GFX11-NEXT: s_mov_b32 s1, 2 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s2, 3 -; GFX11-NEXT: s_mov_b32 s3, 4 -; GFX11-NEXT: s_mov_b32 s4, 5 +; GFX11-NEXT: v_writelane_b32 v40, s0, 7 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v5i32_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v5i32_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 1 -; GFX11-NEXT: v_writelane_b32 v40, s31, 2 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 1 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 2 +; GFX11-NEXT: v_writelane_b32 v40, s6, 2 +; GFX11-NEXT: s_mov_b32 s6, 3 +; GFX11-NEXT: v_writelane_b32 v40, s7, 3 +; GFX11-NEXT: s_mov_b32 s7, 4 +; GFX11-NEXT: v_writelane_b32 v40, s8, 4 +; GFX11-NEXT: s_mov_b32 s8, 5 +; GFX11-NEXT: v_writelane_b32 v40, s30, 5 +; GFX11-NEXT: v_writelane_b32 v40, s31, 6 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 2 -; GFX11-NEXT: v_readlane_b32 s30, v40, 1 +; GFX11-NEXT: v_readlane_b32 s31, v40, 6 +; GFX11-NEXT: v_readlane_b32 s30, v40, 5 +; GFX11-NEXT: v_readlane_b32 s8, v40, 4 +; GFX11-NEXT: v_readlane_b32 s7, v40, 3 +; GFX11-NEXT: v_readlane_b32 s6, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 3 +; GFX11-NEXT: v_readlane_b32 s0, v40, 7 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -13493,23 +14053,31 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3 -; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v5i32_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v5i32_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 2 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 3 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, 4 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 7 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v5i32_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v5i32_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s8, 4 +; GFX10-SCRATCH-NEXT: s_mov_b32 s8, 5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 6 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 6 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s8, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 7 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -13531,36 +14099,35 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 6 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 10 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 +; GFX9-NEXT: v_writelane_b32 v40, s8, 4 +; GFX9-NEXT: v_writelane_b32 v40, s9, 5 +; GFX9-NEXT: v_writelane_b32 v40, s10, 6 +; GFX9-NEXT: v_writelane_b32 v40, s11, 7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx8 s[36:43], s[34:35], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s30, 4 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 8 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v8i32_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v8i32_inreg@abs32@lo ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s36 -; GFX9-NEXT: s_mov_b32 s1, s37 -; GFX9-NEXT: s_mov_b32 s2, s38 -; GFX9-NEXT: s_mov_b32 s3, s39 -; GFX9-NEXT: s_mov_b32 s4, s40 -; GFX9-NEXT: s_mov_b32 s5, s41 -; GFX9-NEXT: s_mov_b32 s6, s42 -; GFX9-NEXT: s_mov_b32 s7, s43 -; GFX9-NEXT: v_writelane_b32 v40, s31, 5 +; GFX9-NEXT: v_writelane_b32 v40, s31, 9 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 5 -; GFX9-NEXT: v_readlane_b32 s30, v40, 4 +; GFX9-NEXT: v_readlane_b32 s31, v40, 9 +; GFX9-NEXT: v_readlane_b32 s30, v40, 8 +; GFX9-NEXT: v_readlane_b32 s11, v40, 7 +; GFX9-NEXT: v_readlane_b32 s10, v40, 6 +; GFX9-NEXT: v_readlane_b32 s9, v40, 5 +; GFX9-NEXT: v_readlane_b32 s8, v40, 4 ; GFX9-NEXT: v_readlane_b32 s7, v40, 3 ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 6 +; GFX9-NEXT: v_readlane_b32 s34, v40, 10 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -13578,36 +14145,35 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 6 +; GFX10-NEXT: v_writelane_b32 v40, s34, 10 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-NEXT: v_writelane_b32 v40, s8, 4 +; GFX10-NEXT: v_writelane_b32 v40, s9, 5 +; GFX10-NEXT: v_writelane_b32 v40, s10, 6 +; GFX10-NEXT: v_writelane_b32 v40, s11, 7 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dwordx8 s[36:43], s[34:35], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[34:35], 0x0 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v8i32_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v8i32_inreg@abs32@lo -; GFX10-NEXT: v_writelane_b32 v40, s30, 4 -; GFX10-NEXT: v_writelane_b32 v40, s31, 5 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s0, s36 -; GFX10-NEXT: s_mov_b32 s1, s37 -; GFX10-NEXT: s_mov_b32 s2, s38 -; GFX10-NEXT: s_mov_b32 s3, s39 -; GFX10-NEXT: s_mov_b32 s4, s40 -; GFX10-NEXT: s_mov_b32 s5, s41 -; GFX10-NEXT: s_mov_b32 s6, s42 -; GFX10-NEXT: s_mov_b32 s7, s43 +; GFX10-NEXT: v_writelane_b32 v40, s30, 8 +; GFX10-NEXT: v_writelane_b32 v40, s31, 9 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 5 -; GFX10-NEXT: v_readlane_b32 s30, v40, 4 +; GFX10-NEXT: v_readlane_b32 s31, v40, 9 +; GFX10-NEXT: v_readlane_b32 s30, v40, 8 +; GFX10-NEXT: v_readlane_b32 s11, v40, 7 +; GFX10-NEXT: v_readlane_b32 s10, v40, 6 +; GFX10-NEXT: v_readlane_b32 s9, v40, 5 +; GFX10-NEXT: v_readlane_b32 s8, v40, 4 ; GFX10-NEXT: v_readlane_b32 s7, v40, 3 ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 6 +; GFX10-NEXT: v_readlane_b32 s34, v40, 10 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -13625,28 +14191,36 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 6 +; GFX11-NEXT: v_writelane_b32 v40, s0, 10 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_mov_b32 s35, external_void_func_v8i32_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s34, external_void_func_v8i32_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 ; GFX11-NEXT: v_writelane_b32 v40, s7, 3 +; GFX11-NEXT: v_writelane_b32 v40, s8, 4 +; GFX11-NEXT: v_writelane_b32 v40, s9, 5 +; GFX11-NEXT: v_writelane_b32 v40, s10, 6 +; GFX11-NEXT: v_writelane_b32 v40, s11, 7 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x0 -; GFX11-NEXT: v_writelane_b32 v40, s30, 4 -; GFX11-NEXT: v_writelane_b32 v40, s31, 5 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v8i32_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v8i32_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s30, 8 +; GFX11-NEXT: v_writelane_b32 v40, s31, 9 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 5 -; GFX11-NEXT: v_readlane_b32 s30, v40, 4 +; GFX11-NEXT: v_readlane_b32 s31, v40, 9 +; GFX11-NEXT: v_readlane_b32 s30, v40, 8 +; GFX11-NEXT: v_readlane_b32 s11, v40, 7 +; GFX11-NEXT: v_readlane_b32 s10, v40, 6 +; GFX11-NEXT: v_readlane_b32 s9, v40, 5 +; GFX11-NEXT: v_readlane_b32 s8, v40, 4 ; GFX11-NEXT: v_readlane_b32 s7, v40, 3 ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 6 +; GFX11-NEXT: v_readlane_b32 s0, v40, 10 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -13664,27 +14238,35 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 6 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 10 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v8i32_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v8i32_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s8, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s9, 5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s10, 6 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s11, 7 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SCRATCH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4 +; GFX10-SCRATCH-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v8i32_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v8i32_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 8 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 9 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 9 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 8 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s11, v40, 7 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s10, v40, 6 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s9, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s8, v40, 4 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 10 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -13708,32 +14290,40 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 6 +; GFX9-NEXT: v_writelane_b32 v40, s34, 10 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 -; GFX9-NEXT: v_writelane_b32 v40, s30, 4 +; GFX9-NEXT: v_writelane_b32 v40, s8, 4 +; GFX9-NEXT: v_writelane_b32 v40, s9, 5 +; GFX9-NEXT: v_writelane_b32 v40, s10, 6 +; GFX9-NEXT: v_writelane_b32 v40, s11, 7 +; GFX9-NEXT: v_writelane_b32 v40, s30, 8 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v8i32_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v8i32_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s0, 1 -; GFX9-NEXT: s_mov_b32 s1, 2 -; GFX9-NEXT: s_mov_b32 s2, 3 -; GFX9-NEXT: s_mov_b32 s3, 4 -; GFX9-NEXT: s_mov_b32 s4, 5 -; GFX9-NEXT: s_mov_b32 s5, 6 -; GFX9-NEXT: s_mov_b32 s6, 7 -; GFX9-NEXT: s_mov_b32 s7, 8 +; GFX9-NEXT: s_mov_b32 s4, 1 +; GFX9-NEXT: s_mov_b32 s5, 2 +; GFX9-NEXT: s_mov_b32 s6, 3 +; GFX9-NEXT: s_mov_b32 s7, 4 +; GFX9-NEXT: s_mov_b32 s8, 5 +; GFX9-NEXT: s_mov_b32 s9, 6 +; GFX9-NEXT: s_mov_b32 s10, 7 +; GFX9-NEXT: s_mov_b32 s11, 8 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 5 +; GFX9-NEXT: v_writelane_b32 v40, s31, 9 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 5 -; GFX9-NEXT: v_readlane_b32 s30, v40, 4 +; GFX9-NEXT: v_readlane_b32 s31, v40, 9 +; GFX9-NEXT: v_readlane_b32 s30, v40, 8 +; GFX9-NEXT: v_readlane_b32 s11, v40, 7 +; GFX9-NEXT: v_readlane_b32 s10, v40, 6 +; GFX9-NEXT: v_readlane_b32 s9, v40, 5 +; GFX9-NEXT: v_readlane_b32 s8, v40, 4 ; GFX9-NEXT: v_readlane_b32 s7, v40, 3 ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 6 +; GFX9-NEXT: v_readlane_b32 s34, v40, 10 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -13751,32 +14341,40 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 6 +; GFX10-NEXT: v_writelane_b32 v40, s34, 10 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v8i32_inreg@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v8i32_inreg@abs32@lo -; GFX10-NEXT: s_mov_b32 s0, 1 -; GFX10-NEXT: s_mov_b32 s1, 2 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s2, 3 -; GFX10-NEXT: s_mov_b32 s3, 4 -; GFX10-NEXT: s_mov_b32 s4, 5 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 1 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, 6 +; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 -; GFX10-NEXT: s_mov_b32 s6, 7 +; GFX10-NEXT: s_mov_b32 s6, 3 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 -; GFX10-NEXT: s_mov_b32 s7, 8 -; GFX10-NEXT: v_writelane_b32 v40, s30, 4 -; GFX10-NEXT: v_writelane_b32 v40, s31, 5 +; GFX10-NEXT: s_mov_b32 s7, 4 +; GFX10-NEXT: v_writelane_b32 v40, s8, 4 +; GFX10-NEXT: s_mov_b32 s8, 5 +; GFX10-NEXT: v_writelane_b32 v40, s9, 5 +; GFX10-NEXT: s_mov_b32 s9, 6 +; GFX10-NEXT: v_writelane_b32 v40, s10, 6 +; GFX10-NEXT: s_mov_b32 s10, 7 +; GFX10-NEXT: v_writelane_b32 v40, s11, 7 +; GFX10-NEXT: s_mov_b32 s11, 8 +; GFX10-NEXT: v_writelane_b32 v40, s30, 8 +; GFX10-NEXT: v_writelane_b32 v40, s31, 9 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 5 -; GFX10-NEXT: v_readlane_b32 s30, v40, 4 +; GFX10-NEXT: v_readlane_b32 s31, v40, 9 +; GFX10-NEXT: v_readlane_b32 s30, v40, 8 +; GFX10-NEXT: v_readlane_b32 s11, v40, 7 +; GFX10-NEXT: v_readlane_b32 s10, v40, 6 +; GFX10-NEXT: v_readlane_b32 s9, v40, 5 +; GFX10-NEXT: v_readlane_b32 s8, v40, 4 ; GFX10-NEXT: v_readlane_b32 s7, v40, 3 ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 6 +; GFX10-NEXT: v_readlane_b32 s34, v40, 10 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -13794,33 +14392,41 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 6 -; GFX11-NEXT: s_mov_b32 s35, external_void_func_v8i32_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s34, external_void_func_v8i32_inreg@abs32@lo -; GFX11-NEXT: s_mov_b32 s0, 1 -; GFX11-NEXT: s_mov_b32 s1, 2 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s2, 3 -; GFX11-NEXT: s_mov_b32 s3, 4 -; GFX11-NEXT: s_mov_b32 s4, 5 +; GFX11-NEXT: v_writelane_b32 v40, s0, 10 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v8i32_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v8i32_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 1 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 -; GFX11-NEXT: s_mov_b32 s5, 6 +; GFX11-NEXT: s_mov_b32 s5, 2 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 -; GFX11-NEXT: s_mov_b32 s6, 7 +; GFX11-NEXT: s_mov_b32 s6, 3 ; GFX11-NEXT: v_writelane_b32 v40, s7, 3 -; GFX11-NEXT: s_mov_b32 s7, 8 -; GFX11-NEXT: v_writelane_b32 v40, s30, 4 -; GFX11-NEXT: v_writelane_b32 v40, s31, 5 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX11-NEXT: s_mov_b32 s7, 4 +; GFX11-NEXT: v_writelane_b32 v40, s8, 4 +; GFX11-NEXT: s_mov_b32 s8, 5 +; GFX11-NEXT: v_writelane_b32 v40, s9, 5 +; GFX11-NEXT: s_mov_b32 s9, 6 +; GFX11-NEXT: v_writelane_b32 v40, s10, 6 +; GFX11-NEXT: s_mov_b32 s10, 7 +; GFX11-NEXT: v_writelane_b32 v40, s11, 7 +; GFX11-NEXT: s_mov_b32 s11, 8 +; GFX11-NEXT: v_writelane_b32 v40, s30, 8 +; GFX11-NEXT: v_writelane_b32 v40, s31, 9 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 5 -; GFX11-NEXT: v_readlane_b32 s30, v40, 4 +; GFX11-NEXT: v_readlane_b32 s31, v40, 9 +; GFX11-NEXT: v_readlane_b32 s30, v40, 8 +; GFX11-NEXT: v_readlane_b32 s11, v40, 7 +; GFX11-NEXT: v_readlane_b32 s10, v40, 6 +; GFX11-NEXT: v_readlane_b32 s9, v40, 5 +; GFX11-NEXT: v_readlane_b32 s8, v40, 4 ; GFX11-NEXT: v_readlane_b32 s7, v40, 3 ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 6 +; GFX11-NEXT: v_readlane_b32 s0, v40, 10 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -13838,32 +14444,40 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 6 -; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v8i32_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v8i32_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 2 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 3 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, 4 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 10 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v8i32_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v8i32_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 6 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 7 +; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 -; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 8 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4 +; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s8, 4 +; GFX10-SCRATCH-NEXT: s_mov_b32 s8, 5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s9, 5 +; GFX10-SCRATCH-NEXT: s_mov_b32 s9, 6 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s10, 6 +; GFX10-SCRATCH-NEXT: s_mov_b32 s10, 7 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s11, 7 +; GFX10-SCRATCH-NEXT: s_mov_b32 s11, 8 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 8 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 9 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 9 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 8 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s11, v40, 7 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s10, v40, 6 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s9, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s8, v40, 4 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 10 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -13885,47 +14499,38 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 14 +; GFX9-NEXT: v_writelane_b32 v40, s34, 18 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 ; GFX9-NEXT: v_writelane_b32 v40, s8, 4 ; GFX9-NEXT: v_writelane_b32 v40, s9, 5 ; GFX9-NEXT: v_writelane_b32 v40, s10, 6 ; GFX9-NEXT: v_writelane_b32 v40, s11, 7 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx16 s[36:51], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s12, 8 ; GFX9-NEXT: v_writelane_b32 v40, s13, 9 ; GFX9-NEXT: v_writelane_b32 v40, s14, 10 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s15, 11 -; GFX9-NEXT: v_writelane_b32 v40, s30, 12 +; GFX9-NEXT: v_writelane_b32 v40, s16, 12 +; GFX9-NEXT: v_writelane_b32 v40, s17, 13 +; GFX9-NEXT: v_writelane_b32 v40, s18, 14 +; GFX9-NEXT: v_writelane_b32 v40, s19, 15 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx16 s[4:19], s[34:35], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 16 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v16i32_inreg@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v16i32_inreg@abs32@lo -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s0, s36 -; GFX9-NEXT: s_mov_b32 s1, s37 -; GFX9-NEXT: s_mov_b32 s2, s38 -; GFX9-NEXT: s_mov_b32 s3, s39 -; GFX9-NEXT: s_mov_b32 s4, s40 -; GFX9-NEXT: s_mov_b32 s5, s41 -; GFX9-NEXT: s_mov_b32 s6, s42 -; GFX9-NEXT: s_mov_b32 s7, s43 -; GFX9-NEXT: s_mov_b32 s8, s44 -; GFX9-NEXT: s_mov_b32 s9, s45 -; GFX9-NEXT: s_mov_b32 s10, s46 -; GFX9-NEXT: s_mov_b32 s11, s47 -; GFX9-NEXT: s_mov_b32 s12, s48 -; GFX9-NEXT: s_mov_b32 s13, s49 -; GFX9-NEXT: s_mov_b32 s14, s50 -; GFX9-NEXT: s_mov_b32 s15, s51 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 13 +; GFX9-NEXT: v_writelane_b32 v40, s31, 17 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 13 -; GFX9-NEXT: v_readlane_b32 s30, v40, 12 +; GFX9-NEXT: v_readlane_b32 s31, v40, 17 +; GFX9-NEXT: v_readlane_b32 s30, v40, 16 +; GFX9-NEXT: v_readlane_b32 s19, v40, 15 +; GFX9-NEXT: v_readlane_b32 s18, v40, 14 +; GFX9-NEXT: v_readlane_b32 s17, v40, 13 +; GFX9-NEXT: v_readlane_b32 s16, v40, 12 ; GFX9-NEXT: v_readlane_b32 s15, v40, 11 ; GFX9-NEXT: v_readlane_b32 s14, v40, 10 ; GFX9-NEXT: v_readlane_b32 s13, v40, 9 @@ -13938,7 +14543,7 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 14 +; GFX9-NEXT: v_readlane_b32 s34, v40, 18 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -13956,47 +14561,38 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 { ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 14 +; GFX10-NEXT: v_writelane_b32 v40, s34, 18 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dwordx16 s[36:51], s[34:35], 0x0 -; GFX10-NEXT: s_mov_b32 s35, external_void_func_v16i32_inreg@abs32@hi -; GFX10-NEXT: s_mov_b32 s34, external_void_func_v16i32_inreg@abs32@lo ; GFX10-NEXT: v_writelane_b32 v40, s8, 4 ; GFX10-NEXT: v_writelane_b32 v40, s9, 5 ; GFX10-NEXT: v_writelane_b32 v40, s10, 6 ; GFX10-NEXT: v_writelane_b32 v40, s11, 7 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s0, s36 -; GFX10-NEXT: s_mov_b32 s1, s37 -; GFX10-NEXT: s_mov_b32 s2, s38 -; GFX10-NEXT: s_mov_b32 s3, s39 ; GFX10-NEXT: v_writelane_b32 v40, s12, 8 -; GFX10-NEXT: s_mov_b32 s4, s40 -; GFX10-NEXT: s_mov_b32 s5, s41 -; GFX10-NEXT: s_mov_b32 s6, s42 -; GFX10-NEXT: s_mov_b32 s7, s43 ; GFX10-NEXT: v_writelane_b32 v40, s13, 9 -; GFX10-NEXT: s_mov_b32 s8, s44 -; GFX10-NEXT: s_mov_b32 s9, s45 -; GFX10-NEXT: s_mov_b32 s10, s46 -; GFX10-NEXT: s_mov_b32 s11, s47 ; GFX10-NEXT: v_writelane_b32 v40, s14, 10 -; GFX10-NEXT: s_mov_b32 s12, s48 -; GFX10-NEXT: s_mov_b32 s13, s49 -; GFX10-NEXT: s_mov_b32 s14, s50 ; GFX10-NEXT: v_writelane_b32 v40, s15, 11 -; GFX10-NEXT: s_mov_b32 s15, s51 -; GFX10-NEXT: v_writelane_b32 v40, s30, 12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 13 +; GFX10-NEXT: v_writelane_b32 v40, s16, 12 +; GFX10-NEXT: v_writelane_b32 v40, s17, 13 +; GFX10-NEXT: v_writelane_b32 v40, s18, 14 +; GFX10-NEXT: v_writelane_b32 v40, s19, 15 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dwordx16 s[4:19], s[34:35], 0x0 +; GFX10-NEXT: s_mov_b32 s35, external_void_func_v16i32_inreg@abs32@hi +; GFX10-NEXT: s_mov_b32 s34, external_void_func_v16i32_inreg@abs32@lo +; GFX10-NEXT: v_writelane_b32 v40, s30, 16 +; GFX10-NEXT: v_writelane_b32 v40, s31, 17 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 13 -; GFX10-NEXT: v_readlane_b32 s30, v40, 12 +; GFX10-NEXT: v_readlane_b32 s31, v40, 17 +; GFX10-NEXT: v_readlane_b32 s30, v40, 16 +; GFX10-NEXT: v_readlane_b32 s19, v40, 15 +; GFX10-NEXT: v_readlane_b32 s18, v40, 14 +; GFX10-NEXT: v_readlane_b32 s17, v40, 13 +; GFX10-NEXT: v_readlane_b32 s16, v40, 12 ; GFX10-NEXT: v_readlane_b32 s15, v40, 11 ; GFX10-NEXT: v_readlane_b32 s14, v40, 10 ; GFX10-NEXT: v_readlane_b32 s13, v40, 9 @@ -14009,7 +14605,7 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 14 +; GFX10-NEXT: v_readlane_b32 s34, v40, 18 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -14027,10 +14623,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 14 +; GFX11-NEXT: v_writelane_b32 v40, s0, 18 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_mov_b32 s35, external_void_func_v16i32_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s34, external_void_func_v16i32_inreg@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 @@ -14044,14 +14638,24 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s13, 9 ; GFX11-NEXT: v_writelane_b32 v40, s14, 10 ; GFX11-NEXT: v_writelane_b32 v40, s15, 11 +; GFX11-NEXT: v_writelane_b32 v40, s16, 12 +; GFX11-NEXT: v_writelane_b32 v40, s17, 13 +; GFX11-NEXT: v_writelane_b32 v40, s18, 14 +; GFX11-NEXT: v_writelane_b32 v40, s19, 15 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b512 s[0:15], s[0:1], 0x0 -; GFX11-NEXT: v_writelane_b32 v40, s30, 12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 13 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v16i32_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v16i32_inreg@abs32@lo +; GFX11-NEXT: v_writelane_b32 v40, s30, 16 +; GFX11-NEXT: v_writelane_b32 v40, s31, 17 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 13 -; GFX11-NEXT: v_readlane_b32 s30, v40, 12 +; GFX11-NEXT: v_readlane_b32 s31, v40, 17 +; GFX11-NEXT: v_readlane_b32 s30, v40, 16 +; GFX11-NEXT: v_readlane_b32 s19, v40, 15 +; GFX11-NEXT: v_readlane_b32 s18, v40, 14 +; GFX11-NEXT: v_readlane_b32 s17, v40, 13 +; GFX11-NEXT: v_readlane_b32 s16, v40, 12 ; GFX11-NEXT: v_readlane_b32 s15, v40, 11 ; GFX11-NEXT: v_readlane_b32 s14, v40, 10 ; GFX11-NEXT: v_readlane_b32 s13, v40, 9 @@ -14064,7 +14668,7 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 14 +; GFX11-NEXT: v_readlane_b32 s0, v40, 18 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 @@ -14082,10 +14686,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 14 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 18 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v16i32_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v16i32_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 @@ -14099,13 +14701,23 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s13, 9 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s14, 10 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s15, 11 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s16, 12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s17, 13 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s18, 14 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s19, 15 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[0:15], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 13 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 13 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 12 +; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v16i32_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v16i32_inreg@abs32@lo +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 17 +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 17 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 16 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s19, v40, 15 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s18, v40, 14 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s17, v40, 13 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s16, v40, 12 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s15, v40, 11 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s14, v40, 10 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s13, v40, 9 @@ -14118,7 +14730,7 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 14 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 18 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 @@ -14159,47 +14771,49 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 { ; GFX9-NEXT: v_writelane_b32 v40, s17, 13 ; GFX9-NEXT: v_writelane_b32 v40, s18, 14 ; GFX9-NEXT: v_writelane_b32 v40, s19, 15 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s20, 16 ; GFX9-NEXT: v_writelane_b32 v40, s21, 17 ; GFX9-NEXT: v_writelane_b32 v40, s22, 18 ; GFX9-NEXT: v_writelane_b32 v40, s23, 19 ; GFX9-NEXT: v_writelane_b32 v40, s24, 20 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx16 s[36:51], s[34:35], 0x40 +; GFX9-NEXT: s_load_dwordx16 s[4:19], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s25, 21 ; GFX9-NEXT: v_writelane_b32 v40, s26, 22 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s27, 23 -; GFX9-NEXT: v_writelane_b32 v40, s28, 24 -; GFX9-NEXT: v_writelane_b32 v40, s29, 25 -; GFX9-NEXT: v_writelane_b32 v40, s30, 26 -; GFX9-NEXT: v_writelane_b32 v40, s31, 27 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40 -; GFX9-NEXT: s_load_dwordx16 s[36:51], s[34:35], 0x0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_mov_b32 s53, external_void_func_v32i32_inreg@abs32@hi -; GFX9-NEXT: s_mov_b32 s52, external_void_func_v32i32_inreg@abs32@lo +; GFX9-NEXT: v_writelane_b32 v40, s28, 24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s30 -; GFX9-NEXT: v_mov_b32_e32 v1, s31 +; GFX9-NEXT: v_mov_b32_e32 v0, s46 +; GFX9-NEXT: v_writelane_b32 v40, s29, 25 +; GFX9-NEXT: v_mov_b32_e32 v1, s47 +; GFX9-NEXT: v_mov_b32_e32 v2, s48 +; GFX9-NEXT: v_mov_b32_e32 v3, s49 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 -; GFX9-NEXT: s_mov_b32 s0, s36 -; GFX9-NEXT: s_mov_b32 s1, s37 -; GFX9-NEXT: s_mov_b32 s2, s38 -; GFX9-NEXT: s_mov_b32 s3, s39 -; GFX9-NEXT: s_mov_b32 s4, s40 -; GFX9-NEXT: s_mov_b32 s5, s41 -; GFX9-NEXT: s_mov_b32 s6, s42 -; GFX9-NEXT: s_mov_b32 s7, s43 -; GFX9-NEXT: s_mov_b32 s8, s44 -; GFX9-NEXT: s_mov_b32 s9, s45 -; GFX9-NEXT: s_mov_b32 s10, s46 -; GFX9-NEXT: s_mov_b32 s11, s47 -; GFX9-NEXT: s_mov_b32 s12, s48 -; GFX9-NEXT: s_mov_b32 s13, s49 -; GFX9-NEXT: s_mov_b32 s14, s50 -; GFX9-NEXT: s_mov_b32 s15, s51 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[52:53] +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 +; GFX9-NEXT: v_mov_b32_e32 v0, s50 +; GFX9-NEXT: v_writelane_b32 v40, s30, 26 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; GFX9-NEXT: v_mov_b32_e32 v0, s51 +; GFX9-NEXT: s_mov_b32 s35, external_void_func_v32i32_inreg@abs32@hi +; GFX9-NEXT: s_mov_b32 s34, external_void_func_v32i32_inreg@abs32@lo +; GFX9-NEXT: s_mov_b32 s20, s36 +; GFX9-NEXT: s_mov_b32 s21, s37 +; GFX9-NEXT: s_mov_b32 s22, s38 +; GFX9-NEXT: s_mov_b32 s23, s39 +; GFX9-NEXT: s_mov_b32 s24, s40 +; GFX9-NEXT: s_mov_b32 s25, s41 +; GFX9-NEXT: s_mov_b32 s26, s42 +; GFX9-NEXT: s_mov_b32 s27, s43 +; GFX9-NEXT: s_mov_b32 s28, s44 +; GFX9-NEXT: s_mov_b32 s29, s45 +; GFX9-NEXT: v_writelane_b32 v40, s31, 27 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 27 ; GFX9-NEXT: v_readlane_b32 s30, v40, 26 ; GFX9-NEXT: v_readlane_b32 s29, v40, 25 @@ -14265,46 +14879,47 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s17, 13 ; GFX10-NEXT: v_writelane_b32 v40, s18, 14 ; GFX10-NEXT: v_writelane_b32 v40, s19, 15 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx16 s[36:51], s[34:35], 0x40 +; GFX10-NEXT: s_load_dwordx16 s[4:19], s[34:35], 0x0 +; GFX10-NEXT: s_mov_b32 s35, external_void_func_v32i32_inreg@abs32@hi +; GFX10-NEXT: s_mov_b32 s34, external_void_func_v32i32_inreg@abs32@lo ; GFX10-NEXT: v_writelane_b32 v40, s20, 16 ; GFX10-NEXT: v_writelane_b32 v40, s21, 17 ; GFX10-NEXT: v_writelane_b32 v40, s22, 18 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s46 ; GFX10-NEXT: v_writelane_b32 v40, s23, 19 +; GFX10-NEXT: v_mov_b32_e32 v1, s47 +; GFX10-NEXT: v_mov_b32_e32 v2, s48 +; GFX10-NEXT: v_mov_b32_e32 v3, s49 +; GFX10-NEXT: s_mov_b32 s20, s36 ; GFX10-NEXT: v_writelane_b32 v40, s24, 20 +; GFX10-NEXT: s_mov_b32 s21, s37 +; GFX10-NEXT: s_mov_b32 s22, s38 +; GFX10-NEXT: s_mov_b32 s23, s39 +; GFX10-NEXT: s_mov_b32 s24, s40 ; GFX10-NEXT: v_writelane_b32 v40, s25, 21 +; GFX10-NEXT: s_mov_b32 s25, s41 +; GFX10-NEXT: v_mov_b32_e32 v4, s50 +; GFX10-NEXT: v_mov_b32_e32 v5, s51 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 +; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 +; GFX10-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 +; GFX10-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; GFX10-NEXT: v_writelane_b32 v40, s26, 22 +; GFX10-NEXT: s_mov_b32 s26, s42 ; GFX10-NEXT: v_writelane_b32 v40, s27, 23 +; GFX10-NEXT: s_mov_b32 s27, s43 ; GFX10-NEXT: v_writelane_b32 v40, s28, 24 +; GFX10-NEXT: s_mov_b32 s28, s44 ; GFX10-NEXT: v_writelane_b32 v40, s29, 25 +; GFX10-NEXT: s_mov_b32 s29, s45 ; GFX10-NEXT: v_writelane_b32 v40, s30, 26 ; GFX10-NEXT: v_writelane_b32 v40, s31, 27 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40 -; GFX10-NEXT: s_load_dwordx16 s[36:51], s[34:35], 0x0 -; GFX10-NEXT: s_mov_b32 s35, external_void_func_v32i32_inreg@abs32@hi -; GFX10-NEXT: s_mov_b32 s34, external_void_func_v32i32_inreg@abs32@lo -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s30 -; GFX10-NEXT: v_mov_b32_e32 v1, s31 -; GFX10-NEXT: s_mov_b32 s4, s40 -; GFX10-NEXT: s_mov_b32 s5, s41 -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 -; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 -; GFX10-NEXT: s_mov_b32 s6, s42 -; GFX10-NEXT: s_mov_b32 s7, s43 -; GFX10-NEXT: s_mov_b32 s8, s44 -; GFX10-NEXT: s_mov_b32 s9, s45 -; GFX10-NEXT: s_mov_b32 s10, s46 -; GFX10-NEXT: s_mov_b32 s11, s47 -; GFX10-NEXT: s_mov_b32 s12, s48 -; GFX10-NEXT: s_mov_b32 s13, s49 -; GFX10-NEXT: s_mov_b32 s14, s50 -; GFX10-NEXT: s_mov_b32 s15, s51 -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 s0, s36 -; GFX10-NEXT: s_mov_b32 s1, s37 -; GFX10-NEXT: s_mov_b32 s2, s38 -; GFX10-NEXT: s_mov_b32 s3, s39 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 27 ; GFX10-NEXT: v_readlane_b32 s30, v40, 26 @@ -14355,8 +14970,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s0, 28 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: s_mov_b32 s35, external_void_func_v32i32_inreg@abs32@hi -; GFX11-NEXT: s_mov_b32 s34, external_void_func_v32i32_inreg@abs32@lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_add_i32 s2, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 @@ -14373,26 +14988,42 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s17, 13 ; GFX11-NEXT: v_writelane_b32 v40, s18, 14 ; GFX11-NEXT: v_writelane_b32 v40, s19, 15 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b512 s[36:51], s[0:1], 0x40 +; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v32i32_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v32i32_inreg@abs32@lo ; GFX11-NEXT: v_writelane_b32 v40, s20, 16 ; GFX11-NEXT: v_writelane_b32 v40, s21, 17 ; GFX11-NEXT: v_writelane_b32 v40, s22, 18 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v4, s50 :: v_dual_mov_b32 v5, s51 ; GFX11-NEXT: v_writelane_b32 v40, s23, 19 +; GFX11-NEXT: v_dual_mov_b32 v0, s46 :: v_dual_mov_b32 v1, s47 +; GFX11-NEXT: v_dual_mov_b32 v2, s48 :: v_dual_mov_b32 v3, s49 ; GFX11-NEXT: v_writelane_b32 v40, s24, 20 +; GFX11-NEXT: s_mov_b32 s20, s36 +; GFX11-NEXT: s_mov_b32 s21, s37 +; GFX11-NEXT: s_mov_b32 s22, s38 +; GFX11-NEXT: s_mov_b32 s23, s39 ; GFX11-NEXT: v_writelane_b32 v40, s25, 21 +; GFX11-NEXT: s_mov_b32 s24, s40 +; GFX11-NEXT: s_mov_b32 s25, s41 +; GFX11-NEXT: scratch_store_b64 off, v[4:5], s2 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 ; GFX11-NEXT: v_writelane_b32 v40, s26, 22 +; GFX11-NEXT: s_mov_b32 s26, s42 ; GFX11-NEXT: v_writelane_b32 v40, s27, 23 +; GFX11-NEXT: s_mov_b32 s27, s43 ; GFX11-NEXT: v_writelane_b32 v40, s28, 24 +; GFX11-NEXT: s_mov_b32 s28, s44 ; GFX11-NEXT: v_writelane_b32 v40, s29, 25 +; GFX11-NEXT: s_mov_b32 s29, s45 ; GFX11-NEXT: v_writelane_b32 v40, s30, 26 ; GFX11-NEXT: v_writelane_b32 v40, s31, 27 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b512 s[16:31], s[0:1], 0x40 -; GFX11-NEXT: s_load_b512 s[0:15], s[0:1], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s30 :: v_dual_mov_b32 v1, s31 -; GFX11-NEXT: scratch_store_b64 off, v[0:1], s32 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 27 ; GFX11-NEXT: v_readlane_b32 s30, v40, 26 ; GFX11-NEXT: v_readlane_b32 s29, v40, 25 @@ -14440,8 +15071,9 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 { ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 28 -; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: s_add_i32 s2, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 @@ -14458,29 +15090,44 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s17, 13 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s18, 14 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s19, 15 +; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0x40 +; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v32i32_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v32i32_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s20, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s21, 17 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s22, 18 +; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, s50 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s23, 19 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, s51 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, s46 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, s47 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, s48 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s24, 20 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, s49 +; GFX10-SCRATCH-NEXT: s_mov_b32 s20, s36 +; GFX10-SCRATCH-NEXT: s_mov_b32 s21, s37 +; GFX10-SCRATCH-NEXT: s_mov_b32 s22, s38 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s25, 21 +; GFX10-SCRATCH-NEXT: s_mov_b32 s23, s39 +; GFX10-SCRATCH-NEXT: s_mov_b32 s24, s40 +; GFX10-SCRATCH-NEXT: s_mov_b32 s25, s41 +; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[4:5], s2 +; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s26, 22 +; GFX10-SCRATCH-NEXT: s_mov_b32 s26, s42 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s27, 23 +; GFX10-SCRATCH-NEXT: s_mov_b32 s27, s43 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s28, 24 +; GFX10-SCRATCH-NEXT: s_mov_b32 s28, s44 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s29, 25 +; GFX10-SCRATCH-NEXT: s_mov_b32 s29, s45 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 26 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 27 -; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40 -; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v32i32_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v32i32_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, s30 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, s31 -; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[0:1], s32 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 27 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 26 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s29, v40, 25 @@ -14549,53 +15196,55 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 { ; GFX9-NEXT: v_writelane_b32 v40, s16, 12 ; GFX9-NEXT: v_writelane_b32 v40, s17, 13 ; GFX9-NEXT: v_writelane_b32 v40, s18, 14 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s19, 15 ; GFX9-NEXT: v_writelane_b32 v40, s20, 16 ; GFX9-NEXT: v_writelane_b32 v40, s21, 17 ; GFX9-NEXT: v_writelane_b32 v40, s22, 18 ; GFX9-NEXT: v_writelane_b32 v40, s23, 19 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s52, s[34:35], 0x0 +; GFX9-NEXT: ; kill: killed $sgpr34_sgpr35 +; GFX9-NEXT: ; kill: killed $sgpr34_sgpr35 +; GFX9-NEXT: s_load_dwordx16 s[36:51], s[34:35], 0x40 +; GFX9-NEXT: s_load_dwordx16 s[4:19], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s24, 20 -; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s25, 21 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s26, 22 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s52 ; GFX9-NEXT: v_writelane_b32 v40, s27, 23 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 +; GFX9-NEXT: v_mov_b32_e32 v0, s46 ; GFX9-NEXT: v_writelane_b32 v40, s28, 24 +; GFX9-NEXT: v_mov_b32_e32 v1, s47 +; GFX9-NEXT: v_mov_b32_e32 v2, s48 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v0, s49 ; GFX9-NEXT: v_writelane_b32 v40, s29, 25 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s52, s[34:35], 0x0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 +; GFX9-NEXT: v_mov_b32_e32 v0, s50 ; GFX9-NEXT: v_writelane_b32 v40, s30, 26 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; GFX9-NEXT: v_mov_b32_e32 v0, s51 +; GFX9-NEXT: s_mov_b32 s35, external_void_func_v32i32_i32_inreg@abs32@hi +; GFX9-NEXT: s_mov_b32 s34, external_void_func_v32i32_i32_inreg@abs32@lo +; GFX9-NEXT: s_mov_b32 s20, s36 +; GFX9-NEXT: s_mov_b32 s21, s37 +; GFX9-NEXT: s_mov_b32 s22, s38 +; GFX9-NEXT: s_mov_b32 s23, s39 +; GFX9-NEXT: s_mov_b32 s24, s40 +; GFX9-NEXT: s_mov_b32 s25, s41 +; GFX9-NEXT: s_mov_b32 s26, s42 +; GFX9-NEXT: s_mov_b32 s27, s43 +; GFX9-NEXT: s_mov_b32 s28, s44 +; GFX9-NEXT: s_mov_b32 s29, s45 ; GFX9-NEXT: v_writelane_b32 v40, s31, 27 -; GFX9-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40 -; GFX9-NEXT: s_load_dwordx16 s[36:51], s[34:35], 0x0 -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s52 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 -; GFX9-NEXT: v_mov_b32_e32 v0, s30 -; GFX9-NEXT: s_mov_b32 s53, external_void_func_v32i32_i32_inreg@abs32@hi -; GFX9-NEXT: v_mov_b32_e32 v1, s31 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 -; GFX9-NEXT: s_mov_b32 s52, external_void_func_v32i32_i32_inreg@abs32@lo -; GFX9-NEXT: s_mov_b32 s0, s36 -; GFX9-NEXT: s_mov_b32 s1, s37 -; GFX9-NEXT: s_mov_b32 s2, s38 -; GFX9-NEXT: s_mov_b32 s3, s39 -; GFX9-NEXT: s_mov_b32 s4, s40 -; GFX9-NEXT: s_mov_b32 s5, s41 -; GFX9-NEXT: s_mov_b32 s6, s42 -; GFX9-NEXT: s_mov_b32 s7, s43 -; GFX9-NEXT: s_mov_b32 s8, s44 -; GFX9-NEXT: s_mov_b32 s9, s45 -; GFX9-NEXT: s_mov_b32 s10, s46 -; GFX9-NEXT: s_mov_b32 s11, s47 -; GFX9-NEXT: s_mov_b32 s12, s48 -; GFX9-NEXT: s_mov_b32 s13, s49 -; GFX9-NEXT: s_mov_b32 s14, s50 -; GFX9-NEXT: s_mov_b32 s15, s51 -; GFX9-NEXT: ; kill: killed $sgpr34_sgpr35 -; GFX9-NEXT: ; kill: killed $sgpr34_sgpr35 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[52:53] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 27 ; GFX9-NEXT: v_readlane_b32 s30, v40, 26 ; GFX9-NEXT: v_readlane_b32 s29, v40, 25 @@ -14661,51 +15310,52 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 { ; GFX10-NEXT: v_writelane_b32 v40, s17, 13 ; GFX10-NEXT: v_writelane_b32 v40, s18, 14 ; GFX10-NEXT: v_writelane_b32 v40, s19, 15 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: s_load_dword s52, s[34:35], 0x0 +; GFX10-NEXT: ; meta instruction +; GFX10-NEXT: ; meta instruction +; GFX10-NEXT: s_load_dwordx16 s[36:51], s[34:35], 0x40 +; GFX10-NEXT: s_load_dwordx16 s[4:19], s[34:35], 0x0 +; GFX10-NEXT: s_mov_b32 s35, external_void_func_v32i32_i32_inreg@abs32@hi +; GFX10-NEXT: s_mov_b32 s34, external_void_func_v32i32_i32_inreg@abs32@lo ; GFX10-NEXT: v_writelane_b32 v40, s20, 16 ; GFX10-NEXT: v_writelane_b32 v40, s21, 17 ; GFX10-NEXT: v_writelane_b32 v40, s22, 18 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s52 +; GFX10-NEXT: v_mov_b32_e32 v1, s47 ; GFX10-NEXT: v_writelane_b32 v40, s23, 19 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 +; GFX10-NEXT: v_mov_b32_e32 v0, s46 +; GFX10-NEXT: v_mov_b32_e32 v2, s48 +; GFX10-NEXT: v_mov_b32_e32 v3, s49 ; GFX10-NEXT: v_writelane_b32 v40, s24, 20 +; GFX10-NEXT: s_mov_b32 s20, s36 +; GFX10-NEXT: s_mov_b32 s21, s37 +; GFX10-NEXT: s_mov_b32 s22, s38 +; GFX10-NEXT: s_mov_b32 s23, s39 ; GFX10-NEXT: v_writelane_b32 v40, s25, 21 +; GFX10-NEXT: s_mov_b32 s24, s40 +; GFX10-NEXT: s_mov_b32 s25, s41 +; GFX10-NEXT: v_mov_b32_e32 v4, s50 +; GFX10-NEXT: v_mov_b32_e32 v5, s51 ; GFX10-NEXT: v_writelane_b32 v40, s26, 22 +; GFX10-NEXT: s_mov_b32 s26, s42 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 +; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 +; GFX10-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 +; GFX10-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; GFX10-NEXT: v_writelane_b32 v40, s27, 23 +; GFX10-NEXT: s_mov_b32 s27, s43 ; GFX10-NEXT: v_writelane_b32 v40, s28, 24 +; GFX10-NEXT: s_mov_b32 s28, s44 ; GFX10-NEXT: v_writelane_b32 v40, s29, 25 +; GFX10-NEXT: s_mov_b32 s29, s45 ; GFX10-NEXT: v_writelane_b32 v40, s30, 26 ; GFX10-NEXT: v_writelane_b32 v40, s31, 27 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s52, s[34:35], 0x0 -; GFX10-NEXT: ; meta instruction -; GFX10-NEXT: ; meta instruction -; GFX10-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40 -; GFX10-NEXT: s_load_dwordx16 s[36:51], s[34:35], 0x0 -; GFX10-NEXT: s_mov_b32 s35, external_void_func_v32i32_i32_inreg@abs32@hi -; GFX10-NEXT: s_mov_b32 s34, external_void_func_v32i32_i32_inreg@abs32@lo -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s52 -; GFX10-NEXT: v_mov_b32_e32 v1, s30 -; GFX10-NEXT: v_mov_b32_e32 v2, s31 -; GFX10-NEXT: s_mov_b32 s4, s40 -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 -; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 -; GFX10-NEXT: s_mov_b32 s5, s41 -; GFX10-NEXT: s_mov_b32 s6, s42 -; GFX10-NEXT: s_mov_b32 s7, s43 -; GFX10-NEXT: s_mov_b32 s8, s44 -; GFX10-NEXT: s_mov_b32 s9, s45 -; GFX10-NEXT: s_mov_b32 s10, s46 -; GFX10-NEXT: s_mov_b32 s11, s47 -; GFX10-NEXT: s_mov_b32 s12, s48 -; GFX10-NEXT: s_mov_b32 s13, s49 -; GFX10-NEXT: s_mov_b32 s14, s50 -; GFX10-NEXT: s_mov_b32 s15, s51 -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 s0, s36 -; GFX10-NEXT: s_mov_b32 s1, s37 -; GFX10-NEXT: s_mov_b32 s2, s38 -; GFX10-NEXT: s_mov_b32 s3, s39 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 27 ; GFX10-NEXT: v_readlane_b32 s30, v40, 26 @@ -14756,8 +15406,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 { ; GFX11-NEXT: v_writelane_b32 v40, s0, 28 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: s_mov_b32 s35, external_void_func_v32i32_i32_inreg@abs32@hi -; GFX11-NEXT: s_add_i32 s36, s32, 8 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_add_i32 s3, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 @@ -14774,30 +15424,46 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 { ; GFX11-NEXT: v_writelane_b32 v40, s17, 13 ; GFX11-NEXT: v_writelane_b32 v40, s18, 14 ; GFX11-NEXT: v_writelane_b32 v40, s19, 15 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-NEXT: s_load_b512 s[36:51], s[0:1], 0x40 +; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v32i32_i32_inreg@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v32i32_i32_inreg@abs32@lo ; GFX11-NEXT: v_writelane_b32 v40, s20, 16 ; GFX11-NEXT: v_writelane_b32 v40, s21, 17 ; GFX11-NEXT: v_writelane_b32 v40, s22, 18 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v5, s51 ; GFX11-NEXT: v_writelane_b32 v40, s23, 19 +; GFX11-NEXT: v_dual_mov_b32 v4, s50 :: v_dual_mov_b32 v1, s47 +; GFX11-NEXT: v_dual_mov_b32 v0, s46 :: v_dual_mov_b32 v3, s49 ; GFX11-NEXT: v_writelane_b32 v40, s24, 20 +; GFX11-NEXT: v_mov_b32_e32 v2, s48 +; GFX11-NEXT: s_add_i32 s2, s32, 24 +; GFX11-NEXT: s_mov_b32 s20, s36 +; GFX11-NEXT: s_mov_b32 s21, s37 ; GFX11-NEXT: v_writelane_b32 v40, s25, 21 +; GFX11-NEXT: s_mov_b32 s22, s38 +; GFX11-NEXT: s_mov_b32 s23, s39 +; GFX11-NEXT: s_mov_b32 s24, s40 +; GFX11-NEXT: s_mov_b32 s25, s41 ; GFX11-NEXT: v_writelane_b32 v40, s26, 22 +; GFX11-NEXT: s_mov_b32 s26, s42 +; GFX11-NEXT: scratch_store_b32 off, v6, s2 +; GFX11-NEXT: scratch_store_b64 off, v[4:5], s3 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 ; GFX11-NEXT: v_writelane_b32 v40, s27, 23 +; GFX11-NEXT: s_mov_b32 s27, s43 ; GFX11-NEXT: v_writelane_b32 v40, s28, 24 +; GFX11-NEXT: s_mov_b32 s28, s44 ; GFX11-NEXT: v_writelane_b32 v40, s29, 25 +; GFX11-NEXT: s_mov_b32 s29, s45 ; GFX11-NEXT: v_writelane_b32 v40, s30, 26 ; GFX11-NEXT: v_writelane_b32 v40, s31, 27 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s34, s[0:1], 0x0 -; GFX11-NEXT: s_load_b512 s[16:31], s[0:1], 0x40 -; GFX11-NEXT: s_load_b512 s[0:15], s[0:1], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v2, s34 :: v_dual_mov_b32 v1, s31 -; GFX11-NEXT: v_mov_b32_e32 v0, s30 -; GFX11-NEXT: s_mov_b32 s34, external_void_func_v32i32_i32_inreg@abs32@lo -; GFX11-NEXT: scratch_store_b32 off, v2, s36 -; GFX11-NEXT: scratch_store_b64 off, v[0:1], s32 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 27 ; GFX11-NEXT: v_readlane_b32 s30, v40, 26 ; GFX11-NEXT: v_readlane_b32 s29, v40, 25 @@ -14845,17 +15511,13 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 { ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 28 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: s_load_dword s36, s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: s_add_i32 s3, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 -; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, s36 -; GFX10-SCRATCH-NEXT: s_add_i32 s36, s32, 8 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s8, 4 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s9, 5 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s10, 6 @@ -14868,29 +15530,50 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s17, 13 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s18, 14 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s19, 15 +; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SCRATCH-NEXT: s_clause 0x2 +; GFX10-SCRATCH-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: ; meta instruction +; GFX10-SCRATCH-NEXT: ; meta instruction +; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0x40 +; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v32i32_i32_inreg@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v32i32_i32_inreg@abs32@lo ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s20, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s21, 17 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s22, 18 +; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, s2 +; GFX10-SCRATCH-NEXT: s_add_i32 s2, s32, 24 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, s50 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s23, 19 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, s51 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, s46 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, s47 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, s48 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s24, 20 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, s49 +; GFX10-SCRATCH-NEXT: s_mov_b32 s20, s36 +; GFX10-SCRATCH-NEXT: s_mov_b32 s21, s37 +; GFX10-SCRATCH-NEXT: s_mov_b32 s22, s38 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s25, 21 +; GFX10-SCRATCH-NEXT: s_mov_b32 s23, s39 +; GFX10-SCRATCH-NEXT: s_mov_b32 s24, s40 +; GFX10-SCRATCH-NEXT: s_mov_b32 s25, s41 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v6, s2 +; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[4:5], s3 +; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s26, 22 +; GFX10-SCRATCH-NEXT: s_mov_b32 s26, s42 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s27, 23 +; GFX10-SCRATCH-NEXT: s_mov_b32 s27, s43 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s28, 24 +; GFX10-SCRATCH-NEXT: s_mov_b32 s28, s44 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s29, 25 +; GFX10-SCRATCH-NEXT: s_mov_b32 s29, s45 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 26 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 27 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40 -; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v32i32_i32_inreg@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v32i32_i32_inreg@abs32@lo -; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, s30 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, s31 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v2, s36 -; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[0:1], s32 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 27 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 26 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s29, v40, 25 @@ -16714,7 +17397,6 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg) ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_bf16@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_bf16@abs32@lo -; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -16741,7 +17423,6 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg) ; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_bf16@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_bf16@abs32@lo -; GFX10-NEXT: s_mov_b32 s0, s4 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 @@ -16761,18 +17442,18 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg) ; GFX11-LABEL: test_call_external_void_func_bf16_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s1, s33 +; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_or_saveexec_b32 s2, -1 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: v_writelane_b32 v40, s1, 2 -; GFX11-NEXT: s_mov_b32 s3, external_void_func_bf16@abs32@hi -; GFX11-NEXT: s_mov_b32 s2, external_void_func_bf16@abs32@lo +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_bf16@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_bf16@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 @@ -16788,19 +17469,19 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg) ; GFX10-SCRATCH-LABEL: test_call_external_void_func_bf16_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s1, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_bf16@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_bf16@abs32@lo +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_bf16@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_bf16@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 @@ -16830,7 +17511,6 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v1bf16@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v1bf16@abs32@lo -; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -16857,7 +17537,6 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg ; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v1bf16@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v1bf16@abs32@lo -; GFX10-NEXT: s_mov_b32 s0, s4 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 @@ -16877,18 +17556,18 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg ; GFX11-LABEL: test_call_external_void_func_v1bf16_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s1, s33 +; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_or_saveexec_b32 s2, -1 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: v_writelane_b32 v40, s1, 2 -; GFX11-NEXT: s_mov_b32 s3, external_void_func_v1bf16@abs32@hi -; GFX11-NEXT: s_mov_b32 s2, external_void_func_v1bf16@abs32@lo +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v1bf16@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v1bf16@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 @@ -16904,19 +17583,19 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v1bf16_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s1, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v1bf16@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v1bf16@abs32@lo +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v1bf16@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v1bf16@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 @@ -16946,7 +17625,6 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2bf16@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2bf16@abs32@lo -; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -16973,7 +17651,6 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg ; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2bf16@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2bf16@abs32@lo -; GFX10-NEXT: s_mov_b32 s0, s4 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 @@ -16993,18 +17670,18 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg ; GFX11-LABEL: test_call_external_void_func_v2bf16_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s1, s33 +; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_or_saveexec_b32 s2, -1 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: v_writelane_b32 v40, s1, 2 -; GFX11-NEXT: s_mov_b32 s3, external_void_func_v2bf16@abs32@hi -; GFX11-NEXT: s_mov_b32 s2, external_void_func_v2bf16@abs32@lo +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2bf16@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2bf16@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 @@ -17020,19 +17697,19 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2bf16_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SCRATCH-NEXT: s_mov_b32 s1, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s1, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v2bf16@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v2bf16@abs32@lo +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2bf16@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2bf16@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 @@ -17062,8 +17739,6 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3bf16@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3bf16@abs32@lo -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -17090,10 +17765,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre ; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3bf16@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3bf16@abs32@lo -; GFX10-NEXT: s_mov_b32 s1, s5 -; GFX10-NEXT: s_mov_b32 s0, s4 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 @@ -17111,18 +17784,18 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre ; GFX11-LABEL: test_call_external_void_func_v3bf16_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s2, s33 +; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_or_saveexec_b32 s3, -1 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s3 -; GFX11-NEXT: v_writelane_b32 v40, s2, 2 -; GFX11-NEXT: s_mov_b32 s3, external_void_func_v3bf16@abs32@hi -; GFX11-NEXT: s_mov_b32 s2, external_void_func_v3bf16@abs32@lo +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3bf16@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3bf16@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 @@ -17138,19 +17811,19 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3bf16_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s3, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s3 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s2, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v3bf16@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v3bf16@abs32@lo +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3bf16@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3bf16@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 @@ -17180,8 +17853,6 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v4bf16@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v4bf16@abs32@lo -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -17208,10 +17879,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre ; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v4bf16@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v4bf16@abs32@lo -; GFX10-NEXT: s_mov_b32 s1, s5 -; GFX10-NEXT: s_mov_b32 s0, s4 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 @@ -17229,18 +17898,18 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre ; GFX11-LABEL: test_call_external_void_func_v4bf16_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s2, s33 +; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_or_saveexec_b32 s3, -1 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s3 -; GFX11-NEXT: v_writelane_b32 v40, s2, 2 -; GFX11-NEXT: s_mov_b32 s3, external_void_func_v4bf16@abs32@hi -; GFX11-NEXT: s_mov_b32 s2, external_void_func_v4bf16@abs32@lo +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v4bf16@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v4bf16@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 @@ -17256,19 +17925,19 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4bf16_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s3, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s3 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s2, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v4bf16@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v4bf16@abs32@lo +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v4bf16@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v4bf16@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 @@ -17298,10 +17967,6 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v8bf16@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v8bf16@abs32@lo -; GFX9-NEXT: s_mov_b32 s3, s7 -; GFX9-NEXT: s_mov_b32 s2, s6 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -17328,12 +17993,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre ; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v8bf16@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v8bf16@abs32@lo -; GFX10-NEXT: s_mov_b32 s3, s7 -; GFX10-NEXT: s_mov_b32 s2, s6 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: s_mov_b32 s1, s5 -; GFX10-NEXT: s_mov_b32 s0, s4 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 @@ -17351,18 +18012,18 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre ; GFX11-LABEL: test_call_external_void_func_v8bf16_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s34, s33 +; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_or_saveexec_b32 s35, -1 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s35 -; GFX11-NEXT: v_writelane_b32 v40, s34, 2 -; GFX11-NEXT: s_mov_b32 s35, external_void_func_v8bf16@abs32@hi -; GFX11-NEXT: s_mov_b32 s34, external_void_func_v8bf16@abs32@lo +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v8bf16@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v8bf16@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 @@ -17378,19 +18039,19 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v8bf16_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SCRATCH-NEXT: s_mov_b32 s34, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s34, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v8bf16@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v8bf16@abs32@lo +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v8bf16@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v8bf16@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 @@ -17416,32 +18077,16 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 6 -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 -; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: v_writelane_b32 v40, s6, 2 -; GFX9-NEXT: v_writelane_b32 v40, s7, 3 -; GFX9-NEXT: v_writelane_b32 v40, s30, 4 +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v16bf16@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v16bf16@abs32@lo -; GFX9-NEXT: s_mov_b32 s3, s7 -; GFX9-NEXT: s_mov_b32 s2, s6 -; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s4, s8 -; GFX9-NEXT: s_mov_b32 s5, s9 -; GFX9-NEXT: s_mov_b32 s6, s10 -; GFX9-NEXT: s_mov_b32 s7, s11 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 5 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 5 -; GFX9-NEXT: v_readlane_b32 s30, v40, 4 -; GFX9-NEXT: v_readlane_b32 s7, v40, 3 -; GFX9-NEXT: v_readlane_b32 s6, v40, 2 -; GFX9-NEXT: v_readlane_b32 s5, v40, 1 -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 6 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] @@ -17459,32 +18104,16 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 6 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v16bf16@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v16bf16@abs32@lo -; GFX10-NEXT: s_mov_b32 s3, s7 -; GFX10-NEXT: s_mov_b32 s2, s6 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s1, s5 -; GFX10-NEXT: s_mov_b32 s0, s4 -; GFX10-NEXT: s_mov_b32 s4, s8 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s5, 1 -; GFX10-NEXT: s_mov_b32 s5, s9 -; GFX10-NEXT: v_writelane_b32 v40, s6, 2 -; GFX10-NEXT: s_mov_b32 s6, s10 -; GFX10-NEXT: v_writelane_b32 v40, s7, 3 -; GFX10-NEXT: s_mov_b32 s7, s11 -; GFX10-NEXT: v_writelane_b32 v40, s30, 4 -; GFX10-NEXT: v_writelane_b32 v40, s31, 5 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 5 -; GFX10-NEXT: v_readlane_b32 s30, v40, 4 -; GFX10-NEXT: v_readlane_b32 s7, v40, 3 -; GFX10-NEXT: v_readlane_b32 s6, v40, 2 -; GFX10-NEXT: v_readlane_b32 s5, v40, 1 -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 6 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -17497,18 +18126,18 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in ; GFX11-LABEL: test_call_external_void_func_v16bf16_inreg: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s34, s33 +; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_or_saveexec_b32 s35, -1 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s35 -; GFX11-NEXT: v_writelane_b32 v40, s34, 2 -; GFX11-NEXT: s_mov_b32 s35, external_void_func_v16bf16@abs32@hi -; GFX11-NEXT: s_mov_b32 s34, external_void_func_v16bf16@abs32@lo +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: s_mov_b32 s1, external_void_func_v16bf16@abs32@hi +; GFX11-NEXT: s_mov_b32 s0, external_void_func_v16bf16@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 @@ -17524,19 +18153,19 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v16bf16_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SCRATCH-NEXT: s_mov_b32 s34, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 -; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s35, -1 +; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s34, 2 -; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v16bf16@abs32@hi -; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v16bf16@abs32@lo +; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v16bf16@abs32@hi +; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v16bf16@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll index 25c684004446f..7799b9509ceb0 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll @@ -847,11 +847,11 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) { ; GCN-LABEL: test_indirect_call_vgpr_ptr_inreg_arg: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s10, s33 +; GCN-NEXT: s_mov_b32 s5, s33 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 @@ -885,19 +885,19 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) { ; GCN-NEXT: v_writelane_b32 v40, s61, 29 ; GCN-NEXT: v_writelane_b32 v40, s62, 30 ; GCN-NEXT: v_writelane_b32 v40, s63, 31 -; GCN-NEXT: s_mov_b64 s[4:5], exec +; GCN-NEXT: s_mov_b64 s[6:7], exec +; GCN-NEXT: s_movk_i32 s4, 0x7b ; GCN-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_readfirstlane_b32 s6, v0 -; GCN-NEXT: v_readfirstlane_b32 s7, v1 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[0:1] -; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GCN-NEXT: s_movk_i32 s0, 0x7b -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: v_readfirstlane_b32 s8, v0 +; GCN-NEXT: v_readfirstlane_b32 s9, v1 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GCN-NEXT: s_and_saveexec_b64 s[10:11], vcc +; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9] ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN-NEXT: s_xor_b64 exec, exec, s[8:9] +; GCN-NEXT: s_xor_b64 exec, exec, s[10:11] ; GCN-NEXT: s_cbranch_execnz .LBB6_1 ; GCN-NEXT: ; %bb.2: -; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: v_readlane_b32 s63, v40, 31 ; GCN-NEXT: v_readlane_b32 s62, v40, 30 ; GCN-NEXT: v_readlane_b32 s61, v40, 29 @@ -930,22 +930,22 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) { ; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s10 +; GCN-NEXT: s_mov_b32 s33, s5 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_indirect_call_vgpr_ptr_inreg_arg: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s10, s33 +; GISEL-NEXT: s_mov_b32 s5, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 -; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GISEL-NEXT: s_mov_b64 exec, s[4:5] +; GISEL-NEXT: s_mov_b64 exec, s[6:7] ; GISEL-NEXT: s_addk_i32 s32, 0x400 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 @@ -979,19 +979,19 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) { ; GISEL-NEXT: v_writelane_b32 v40, s61, 29 ; GISEL-NEXT: v_writelane_b32 v40, s62, 30 ; GISEL-NEXT: v_writelane_b32 v40, s63, 31 -; GISEL-NEXT: s_mov_b64 s[4:5], exec +; GISEL-NEXT: s_mov_b64 s[6:7], exec +; GISEL-NEXT: s_movk_i32 s4, 0x7b ; GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 -; GISEL-NEXT: v_readfirstlane_b32 s6, v0 -; GISEL-NEXT: v_readfirstlane_b32 s7, v1 -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[0:1] -; GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GISEL-NEXT: s_movk_i32 s0, 0x7b -; GISEL-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GISEL-NEXT: v_readfirstlane_b32 s8, v0 +; GISEL-NEXT: v_readfirstlane_b32 s9, v1 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] +; GISEL-NEXT: s_and_saveexec_b64 s[10:11], vcc +; GISEL-NEXT: s_swappc_b64 s[30:31], s[8:9] ; GISEL-NEXT: ; implicit-def: $vgpr0 -; GISEL-NEXT: s_xor_b64 exec, exec, s[8:9] +; GISEL-NEXT: s_xor_b64 exec, exec, s[10:11] ; GISEL-NEXT: s_cbranch_execnz .LBB6_1 ; GISEL-NEXT: ; %bb.2: -; GISEL-NEXT: s_mov_b64 exec, s[4:5] +; GISEL-NEXT: s_mov_b64 exec, s[6:7] ; GISEL-NEXT: v_readlane_b32 s63, v40, 31 ; GISEL-NEXT: v_readlane_b32 s62, v40, 30 ; GISEL-NEXT: v_readlane_b32 s61, v40, 29 @@ -1024,11 +1024,11 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) { ; GISEL-NEXT: v_readlane_b32 s34, v40, 2 ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 -; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GISEL-NEXT: s_mov_b64 exec, s[4:5] +; GISEL-NEXT: s_mov_b64 exec, s[6:7] ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: s_mov_b32 s33, s10 +; GISEL-NEXT: s_mov_b32 s33, s5 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void %fptr(i32 inreg 123) diff --git a/llvm/test/CodeGen/AMDGPU/schedule-addrspaces.ll b/llvm/test/CodeGen/AMDGPU/schedule-addrspaces.ll index 27e9d0fce9426..0139c52db1d58 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-addrspaces.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-addrspaces.ll @@ -10,7 +10,7 @@ define amdgpu_gfx void @example(<4 x i32> inreg %rsrc, ptr addrspace(5) %src, i3 ; CHECK-NEXT: scratch_load_b32 v2, v0, off ; CHECK-NEXT: scratch_load_b32 v3, v3, off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_b64 v[2:3], v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_b64 v[2:3], v1, s[4:7], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] %x0 = load i32, ptr addrspace(5) %src diff --git a/llvm/test/CodeGen/AMDGPU/scratch-pointer-sink.ll b/llvm/test/CodeGen/AMDGPU/scratch-pointer-sink.ll index ec1de0222d5d9..cdaac14833e0e 100644 --- a/llvm/test/CodeGen/AMDGPU/scratch-pointer-sink.ll +++ b/llvm/test/CodeGen/AMDGPU/scratch-pointer-sink.ll @@ -6,7 +6,7 @@ define amdgpu_gfx i32 @sink_scratch_pointer(ptr addrspace(5) %stack, i32 inreg % ; GCN-LABEL: sink_scratch_pointer: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 ; GCN-NEXT: s_cbranch_scc0 .LBB0_2 ; GCN-NEXT: ; %bb.1: ; %bb2 ; GCN-NEXT: scratch_load_b32 v0, v0, off offset:-4 @@ -21,7 +21,7 @@ define amdgpu_gfx i32 @sink_scratch_pointer(ptr addrspace(5) %stack, i32 inreg % ; GISEL-LABEL: sink_scratch_pointer: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GISEL-NEXT: s_cbranch_scc0 .LBB0_2 ; GISEL-NEXT: ; %bb.1: ; %bb2 ; GISEL-NEXT: scratch_load_b32 v0, v0, off offset:-4