diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp index cbd08f0fb5dff..5161a75097aeb 100644 --- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -267,11 +267,19 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs( std::vector CSI; const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs(); + Register RetAddrReg = TRI->getReturnAddressReg(MF); + bool SpillRetAddrReg = false; for (unsigned I = 0; CSRegs[I]; ++I) { MCRegister Reg = CSRegs[I]; if (SavedRegs.test(Reg)) { + if (Reg == TRI->getSubReg(RetAddrReg, AMDGPU::sub0) || + Reg == TRI->getSubReg(RetAddrReg, AMDGPU::sub1)) { + SpillRetAddrReg = true; + continue; + } + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, MVT::i32); int JunkFI = MFI.CreateStackObject(TRI->getSpillSize(*RC), @@ -282,6 +290,18 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs( } } + // Return address uses a register pair. Add the super register to the + // CSI list so that it's easier to identify the entire spill and CFI + // can be emitted appropriately. + if (SpillRetAddrReg) { + const TargetRegisterClass *RC = + TRI->getMinimalPhysRegClass(RetAddrReg, MVT::i64); + int JunkFI = MFI.CreateStackObject(TRI->getSpillSize(*RC), + TRI->getSpillAlign(*RC), true); + CSI.push_back(CalleeSavedInfo(RetAddrReg, JunkFI)); + CalleeSavedFIs.push_back(JunkFI); + } + if (!CSI.empty()) { for (MachineBasicBlock *SaveBlock : SaveBlocks) insertCSRSaves(*SaveBlock, CSI, Indexes, LIS); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll index b84b31cd2702c..023398377de94 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll @@ -23,10 +23,10 @@ define ptr addrspace(1) @call_assert_align() { ; CHECK-NEXT: v_writelane_b32 v40, s31, 1 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: v_readlane_b32 s30, v40, 0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_readlane_b32 s31, v40, 1 -; CHECK-NEXT: v_readlane_b32 s30, v40, 0 ; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: v_readlane_b32 s4, v40, 2 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll index 7e6f500181ec6..2c1beb8468576 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll @@ -238,8 +238,8 @@ define void @func_caller_stack() #2 { ; MUBUF-NEXT: v_writelane_b32 v40, s31, 1 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 ; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5] -; MUBUF-NEXT: v_readlane_b32 s31, v40, 1 ; MUBUF-NEXT: v_readlane_b32 s30, v40, 0 +; MUBUF-NEXT: v_readlane_b32 s31, v40, 1 ; MUBUF-NEXT: s_mov_b32 s32, s33 ; MUBUF-NEXT: v_readlane_b32 s4, v40, 2 ; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -277,8 +277,8 @@ define void @func_caller_stack() #2 { ; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_v16i32_v16i32_v4i32@rel32@hi+12 ; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1 ; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] -; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1 ; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0 +; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1 ; FLATSCR-NEXT: s_mov_b32 s32, s33 ; FLATSCR-NEXT: v_readlane_b32 s0, v40, 2 ; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 @@ -363,8 +363,8 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) #2 { ; MUBUF-NEXT: s_waitcnt vmcnt(1) ; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:60 ; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5] -; MUBUF-NEXT: v_readlane_b32 s31, v40, 1 ; MUBUF-NEXT: v_readlane_b32 s30, v40, 0 +; MUBUF-NEXT: v_readlane_b32 s31, v40, 1 ; MUBUF-NEXT: s_mov_b32 s32, s33 ; MUBUF-NEXT: v_readlane_b32 s4, v40, 2 ; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -414,8 +414,8 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) #2 { ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s32 offset:56 ; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] -; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1 ; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0 +; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1 ; FLATSCR-NEXT: s_mov_b32 s32, s33 ; FLATSCR-NEXT: v_readlane_b32 s0, v40, 2 ; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll index 72766f47030cc..35591cd602992 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll @@ -244,8 +244,8 @@ define void @sink_null_insert_pt(ptr addrspace(4) %arg0) #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], 0 -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll index 3194581fa4213..0e24430e7be20 100644 --- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll +++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll @@ -30,8 +30,8 @@ define void @parent_func_missing_inputs() #0 { ; FIXEDABI-NEXT: s_addc_u32 s17, s17, requires_all_inputs@rel32@hi+12 ; FIXEDABI-NEXT: v_writelane_b32 v40, s31, 1 ; FIXEDABI-NEXT: s_swappc_b64 s[30:31], s[16:17] -; FIXEDABI-NEXT: v_readlane_b32 s31, v40, 1 ; FIXEDABI-NEXT: v_readlane_b32 s30, v40, 0 +; FIXEDABI-NEXT: v_readlane_b32 s31, v40, 1 ; FIXEDABI-NEXT: s_mov_b32 s32, s33 ; FIXEDABI-NEXT: v_readlane_b32 s4, v40, 2 ; FIXEDABI-NEXT: s_or_saveexec_b64 s[6:7], -1 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll index 149b0cb4e052d..b6e65c8842904 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll @@ -35,8 +35,8 @@ define amdgpu_gfx void @basic_test(i32 %x, i32 inreg %c, ptr addrspace(1) %ptr) ; DAGISEL-NEXT: s_clause 0x1 ; DAGISEL-NEXT: scratch_load_b32 v41, off, s33 ; DAGISEL-NEXT: scratch_load_b32 v40, off, s33 offset:4 -; DAGISEL-NEXT: v_readlane_b32 s31, v42, 1 ; DAGISEL-NEXT: v_readlane_b32 s30, v42, 0 +; DAGISEL-NEXT: v_readlane_b32 s31, v42, 1 ; DAGISEL-NEXT: s_mov_b32 s32, s33 ; DAGISEL-NEXT: v_readlane_b32 s0, v42, 2 ; DAGISEL-NEXT: s_or_saveexec_b32 s1, -1 @@ -78,8 +78,8 @@ define amdgpu_gfx void @basic_test(i32 %x, i32 inreg %c, ptr addrspace(1) %ptr) ; GISEL-NEXT: s_clause 0x1 ; GISEL-NEXT: scratch_load_b32 v41, off, s33 ; GISEL-NEXT: scratch_load_b32 v40, off, s33 offset:4 -; GISEL-NEXT: v_readlane_b32 s31, v42, 1 ; GISEL-NEXT: v_readlane_b32 s30, v42, 0 +; GISEL-NEXT: v_readlane_b32 s31, v42, 1 ; GISEL-NEXT: s_mov_b32 s32, s33 ; GISEL-NEXT: v_readlane_b32 s0, v42, 2 ; GISEL-NEXT: s_or_saveexec_b32 s1, -1 @@ -787,8 +787,8 @@ define amdgpu_gfx void @ret_void(i32 %x) #0 { ; DAGISEL-NEXT: s_wait_alu 0xfffe ; DAGISEL-NEXT: s_swappc_b64 s[30:31], s[0:1] ; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; DAGISEL-NEXT: v_readlane_b32 s31, v40, 1 ; DAGISEL-NEXT: v_readlane_b32 s30, v40, 0 +; DAGISEL-NEXT: v_readlane_b32 s31, v40, 1 ; DAGISEL-NEXT: s_mov_b32 s32, s33 ; DAGISEL-NEXT: v_readlane_b32 s0, v40, 2 ; DAGISEL-NEXT: s_or_saveexec_b32 s1, -1 @@ -822,8 +822,8 @@ define amdgpu_gfx void @ret_void(i32 %x) #0 { ; GISEL-NEXT: s_wait_alu 0xfffe ; GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 +; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: s_mov_b32 s32, s33 ; GISEL-NEXT: v_readlane_b32 s0, v40, 2 ; GISEL-NEXT: s_or_saveexec_b32 s1, -1 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll index 6fae7fdbbf9bb..1a6e0e27812fe 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll @@ -16525,18 +16525,17 @@ define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v8, s30, 0 -; SI-NEXT: v_writelane_b32 v8, s31, 1 -; SI-NEXT: v_writelane_b32 v8, s34, 2 -; SI-NEXT: v_writelane_b32 v8, s35, 3 -; SI-NEXT: v_writelane_b32 v8, s36, 4 -; SI-NEXT: v_writelane_b32 v8, s37, 5 -; SI-NEXT: v_writelane_b32 v8, s38, 6 -; SI-NEXT: v_writelane_b32 v8, s39, 7 -; SI-NEXT: v_writelane_b32 v8, s48, 8 -; SI-NEXT: v_writelane_b32 v8, s49, 9 +; SI-NEXT: v_writelane_b32 v8, s34, 0 +; SI-NEXT: v_writelane_b32 v8, s35, 1 +; SI-NEXT: v_writelane_b32 v8, s36, 2 +; SI-NEXT: v_writelane_b32 v8, s37, 3 +; SI-NEXT: v_writelane_b32 v8, s38, 4 +; SI-NEXT: v_writelane_b32 v8, s39, 5 +; SI-NEXT: v_writelane_b32 v8, s48, 6 +; SI-NEXT: v_writelane_b32 v8, s49, 7 +; SI-NEXT: v_writelane_b32 v8, s50, 8 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; SI-NEXT: v_writelane_b32 v8, s50, 10 +; SI-NEXT: v_writelane_b32 v8, s30, 9 ; SI-NEXT: v_readfirstlane_b32 s39, v6 ; SI-NEXT: v_readfirstlane_b32 s48, v5 ; SI-NEXT: v_readfirstlane_b32 s49, v4 @@ -16544,6 +16543,7 @@ define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32 ; SI-NEXT: v_readfirstlane_b32 s35, v2 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_readfirstlane_b32 s38, v1 +; SI-NEXT: v_writelane_b32 v8, s31, 10 ; SI-NEXT: s_cbranch_scc0 .LBB49_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff @@ -16815,18 +16815,18 @@ define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: v_readlane_b32 s30, v8, 9 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: v_readlane_b32 s50, v8, 10 -; SI-NEXT: v_readlane_b32 s49, v8, 9 -; SI-NEXT: v_readlane_b32 s48, v8, 8 -; SI-NEXT: v_readlane_b32 s39, v8, 7 -; SI-NEXT: v_readlane_b32 s38, v8, 6 -; SI-NEXT: v_readlane_b32 s37, v8, 5 -; SI-NEXT: v_readlane_b32 s36, v8, 4 -; SI-NEXT: v_readlane_b32 s35, v8, 3 -; SI-NEXT: v_readlane_b32 s34, v8, 2 -; SI-NEXT: v_readlane_b32 s31, v8, 1 -; SI-NEXT: v_readlane_b32 s30, v8, 0 +; SI-NEXT: v_readlane_b32 s31, v8, 10 +; SI-NEXT: v_readlane_b32 s50, v8, 8 +; SI-NEXT: v_readlane_b32 s49, v8, 7 +; SI-NEXT: v_readlane_b32 s48, v8, 6 +; SI-NEXT: v_readlane_b32 s39, v8, 5 +; SI-NEXT: v_readlane_b32 s38, v8, 4 +; SI-NEXT: v_readlane_b32 s37, v8, 3 +; SI-NEXT: v_readlane_b32 s36, v8, 2 +; SI-NEXT: v_readlane_b32 s35, v8, 1 +; SI-NEXT: v_readlane_b32 s34, v8, 0 ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll index 0329f23ea434f..6bbfcc9f52d95 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll @@ -118,32 +118,32 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) { ; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[18:19] ; CHECK-NEXT: v_writelane_b32 v43, s16, 14 -; CHECK-NEXT: v_writelane_b32 v43, s30, 0 -; CHECK-NEXT: v_writelane_b32 v43, s31, 1 -; CHECK-NEXT: v_writelane_b32 v43, s34, 2 -; CHECK-NEXT: v_writelane_b32 v43, s35, 3 -; CHECK-NEXT: v_writelane_b32 v43, s36, 4 -; CHECK-NEXT: v_writelane_b32 v43, s37, 5 -; CHECK-NEXT: v_writelane_b32 v43, s38, 6 -; CHECK-NEXT: v_writelane_b32 v43, s39, 7 +; CHECK-NEXT: v_writelane_b32 v43, s34, 0 +; CHECK-NEXT: v_writelane_b32 v43, s35, 1 +; CHECK-NEXT: v_writelane_b32 v43, s36, 2 +; CHECK-NEXT: v_writelane_b32 v43, s37, 3 +; CHECK-NEXT: v_writelane_b32 v43, s38, 4 +; CHECK-NEXT: v_writelane_b32 v43, s39, 5 ; CHECK-NEXT: s_addk_i32 s32, 0x800 -; CHECK-NEXT: v_writelane_b32 v43, s48, 8 -; CHECK-NEXT: v_writelane_b32 v43, s49, 9 +; CHECK-NEXT: v_writelane_b32 v43, s48, 6 +; CHECK-NEXT: v_writelane_b32 v43, s49, 7 ; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5] ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12 +; CHECK-NEXT: v_writelane_b32 v43, s50, 8 ; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: v_writelane_b32 v43, s50, 10 +; CHECK-NEXT: v_writelane_b32 v43, s51, 9 +; CHECK-NEXT: v_writelane_b32 v43, s52, 10 ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; CHECK-NEXT: v_writelane_b32 v43, s51, 11 +; CHECK-NEXT: v_writelane_b32 v43, s53, 11 ; CHECK-NEXT: v_mov_b32_e32 v42, v1 -; CHECK-NEXT: v_writelane_b32 v43, s52, 12 +; CHECK-NEXT: v_writelane_b32 v43, s30, 12 ; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v42 ; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] -; CHECK-NEXT: v_writelane_b32 v43, s53, 13 +; CHECK-NEXT: v_writelane_b32 v43, s31, 13 ; CHECK-NEXT: v_mov_b32_e32 v40, v31 ; CHECK-NEXT: v_mov_b32_e32 v41, v2 ; CHECK-NEXT: s_mov_b32 s50, s15 @@ -177,21 +177,21 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) { ; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; CHECK-NEXT: v_readlane_b32 s30, v43, 12 ; CHECK-NEXT: v_or_b32_e32 v1, v2, v1 -; CHECK-NEXT: v_readlane_b32 s53, v43, 13 -; CHECK-NEXT: v_readlane_b32 s52, v43, 12 -; CHECK-NEXT: v_readlane_b32 s51, v43, 11 -; CHECK-NEXT: v_readlane_b32 s50, v43, 10 -; CHECK-NEXT: v_readlane_b32 s49, v43, 9 -; CHECK-NEXT: v_readlane_b32 s48, v43, 8 -; CHECK-NEXT: v_readlane_b32 s39, v43, 7 -; CHECK-NEXT: v_readlane_b32 s38, v43, 6 -; CHECK-NEXT: v_readlane_b32 s37, v43, 5 -; CHECK-NEXT: v_readlane_b32 s36, v43, 4 -; CHECK-NEXT: v_readlane_b32 s35, v43, 3 -; CHECK-NEXT: v_readlane_b32 s34, v43, 2 -; CHECK-NEXT: v_readlane_b32 s31, v43, 1 -; CHECK-NEXT: v_readlane_b32 s30, v43, 0 +; CHECK-NEXT: v_readlane_b32 s31, v43, 13 +; CHECK-NEXT: v_readlane_b32 s53, v43, 11 +; CHECK-NEXT: v_readlane_b32 s52, v43, 10 +; CHECK-NEXT: v_readlane_b32 s51, v43, 9 +; CHECK-NEXT: v_readlane_b32 s50, v43, 8 +; CHECK-NEXT: v_readlane_b32 s49, v43, 7 +; CHECK-NEXT: v_readlane_b32 s48, v43, 6 +; CHECK-NEXT: v_readlane_b32 s39, v43, 5 +; CHECK-NEXT: v_readlane_b32 s38, v43, 4 +; CHECK-NEXT: v_readlane_b32 s37, v43, 3 +; CHECK-NEXT: v_readlane_b32 s36, v43, 2 +; CHECK-NEXT: v_readlane_b32 s35, v43, 1 +; CHECK-NEXT: v_readlane_b32 s34, v43, 0 ; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: v_readlane_b32 s4, v43, 14 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -258,30 +258,30 @@ define double @test_powr_fast_f64(double %x, double %y) { ; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[18:19] ; CHECK-NEXT: v_writelane_b32 v43, s16, 14 -; CHECK-NEXT: v_writelane_b32 v43, s30, 0 -; CHECK-NEXT: v_writelane_b32 v43, s31, 1 -; CHECK-NEXT: v_writelane_b32 v43, s34, 2 -; CHECK-NEXT: v_writelane_b32 v43, s35, 3 -; CHECK-NEXT: v_writelane_b32 v43, s36, 4 -; CHECK-NEXT: v_writelane_b32 v43, s37, 5 -; CHECK-NEXT: v_writelane_b32 v43, s38, 6 -; CHECK-NEXT: v_writelane_b32 v43, s39, 7 +; CHECK-NEXT: v_writelane_b32 v43, s34, 0 +; CHECK-NEXT: v_writelane_b32 v43, s35, 1 +; CHECK-NEXT: v_writelane_b32 v43, s36, 2 +; CHECK-NEXT: v_writelane_b32 v43, s37, 3 +; CHECK-NEXT: v_writelane_b32 v43, s38, 4 +; CHECK-NEXT: v_writelane_b32 v43, s39, 5 ; CHECK-NEXT: s_addk_i32 s32, 0x800 -; CHECK-NEXT: v_writelane_b32 v43, s48, 8 -; CHECK-NEXT: v_writelane_b32 v43, s49, 9 +; CHECK-NEXT: v_writelane_b32 v43, s48, 6 +; CHECK-NEXT: v_writelane_b32 v43, s49, 7 ; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5] ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12 +; CHECK-NEXT: v_writelane_b32 v43, s50, 8 ; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: v_writelane_b32 v43, s50, 10 -; CHECK-NEXT: v_writelane_b32 v43, s51, 11 -; CHECK-NEXT: v_writelane_b32 v43, s52, 12 +; CHECK-NEXT: v_writelane_b32 v43, s51, 9 +; CHECK-NEXT: v_writelane_b32 v43, s52, 10 +; CHECK-NEXT: v_writelane_b32 v43, s53, 11 +; CHECK-NEXT: v_writelane_b32 v43, s30, 12 ; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; CHECK-NEXT: v_writelane_b32 v43, s53, 13 +; CHECK-NEXT: v_writelane_b32 v43, s31, 13 ; CHECK-NEXT: v_mov_b32_e32 v42, v31 ; CHECK-NEXT: v_mov_b32_e32 v41, v3 ; CHECK-NEXT: v_mov_b32_e32 v40, v2 @@ -313,20 +313,20 @@ define double @test_powr_fast_f64(double %x, double %y) { ; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; CHECK-NEXT: v_readlane_b32 s53, v43, 13 -; CHECK-NEXT: v_readlane_b32 s52, v43, 12 -; CHECK-NEXT: v_readlane_b32 s51, v43, 11 -; CHECK-NEXT: v_readlane_b32 s50, v43, 10 -; CHECK-NEXT: v_readlane_b32 s49, v43, 9 -; CHECK-NEXT: v_readlane_b32 s48, v43, 8 -; CHECK-NEXT: v_readlane_b32 s39, v43, 7 -; CHECK-NEXT: v_readlane_b32 s38, v43, 6 -; CHECK-NEXT: v_readlane_b32 s37, v43, 5 -; CHECK-NEXT: v_readlane_b32 s36, v43, 4 -; CHECK-NEXT: v_readlane_b32 s35, v43, 3 -; CHECK-NEXT: v_readlane_b32 s34, v43, 2 -; CHECK-NEXT: v_readlane_b32 s31, v43, 1 -; CHECK-NEXT: v_readlane_b32 s30, v43, 0 +; CHECK-NEXT: v_readlane_b32 s30, v43, 12 +; CHECK-NEXT: v_readlane_b32 s31, v43, 13 +; CHECK-NEXT: v_readlane_b32 s53, v43, 11 +; CHECK-NEXT: v_readlane_b32 s52, v43, 10 +; CHECK-NEXT: v_readlane_b32 s51, v43, 9 +; CHECK-NEXT: v_readlane_b32 s50, v43, 8 +; CHECK-NEXT: v_readlane_b32 s49, v43, 7 +; CHECK-NEXT: v_readlane_b32 s48, v43, 6 +; CHECK-NEXT: v_readlane_b32 s39, v43, 5 +; CHECK-NEXT: v_readlane_b32 s38, v43, 4 +; CHECK-NEXT: v_readlane_b32 s37, v43, 3 +; CHECK-NEXT: v_readlane_b32 s36, v43, 2 +; CHECK-NEXT: v_readlane_b32 s35, v43, 1 +; CHECK-NEXT: v_readlane_b32 s34, v43, 0 ; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: v_readlane_b32 s4, v43, 14 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -400,32 +400,32 @@ define double @test_pown_fast_f64(double %x, i32 %y) { ; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[18:19] ; CHECK-NEXT: v_writelane_b32 v43, s16, 14 -; CHECK-NEXT: v_writelane_b32 v43, s30, 0 -; CHECK-NEXT: v_writelane_b32 v43, s31, 1 -; CHECK-NEXT: v_writelane_b32 v43, s34, 2 -; CHECK-NEXT: v_writelane_b32 v43, s35, 3 -; CHECK-NEXT: v_writelane_b32 v43, s36, 4 -; CHECK-NEXT: v_writelane_b32 v43, s37, 5 -; CHECK-NEXT: v_writelane_b32 v43, s38, 6 -; CHECK-NEXT: v_writelane_b32 v43, s39, 7 +; CHECK-NEXT: v_writelane_b32 v43, s34, 0 +; CHECK-NEXT: v_writelane_b32 v43, s35, 1 +; CHECK-NEXT: v_writelane_b32 v43, s36, 2 +; CHECK-NEXT: v_writelane_b32 v43, s37, 3 +; CHECK-NEXT: v_writelane_b32 v43, s38, 4 +; CHECK-NEXT: v_writelane_b32 v43, s39, 5 ; CHECK-NEXT: s_addk_i32 s32, 0x800 -; CHECK-NEXT: v_writelane_b32 v43, s48, 8 -; CHECK-NEXT: v_writelane_b32 v43, s49, 9 +; CHECK-NEXT: v_writelane_b32 v43, s48, 6 +; CHECK-NEXT: v_writelane_b32 v43, s49, 7 ; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5] ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12 +; CHECK-NEXT: v_writelane_b32 v43, s50, 8 ; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: v_writelane_b32 v43, s50, 10 +; CHECK-NEXT: v_writelane_b32 v43, s51, 9 +; CHECK-NEXT: v_writelane_b32 v43, s52, 10 ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; CHECK-NEXT: v_writelane_b32 v43, s51, 11 +; CHECK-NEXT: v_writelane_b32 v43, s53, 11 ; CHECK-NEXT: v_mov_b32_e32 v42, v1 -; CHECK-NEXT: v_writelane_b32 v43, s52, 12 +; CHECK-NEXT: v_writelane_b32 v43, s30, 12 ; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v42 ; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] -; CHECK-NEXT: v_writelane_b32 v43, s53, 13 +; CHECK-NEXT: v_writelane_b32 v43, s31, 13 ; CHECK-NEXT: v_mov_b32_e32 v40, v31 ; CHECK-NEXT: v_mov_b32_e32 v41, v2 ; CHECK-NEXT: s_mov_b32 s50, s15 @@ -459,21 +459,21 @@ define double @test_pown_fast_f64(double %x, i32 %y) { ; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; CHECK-NEXT: v_readlane_b32 s30, v43, 12 ; CHECK-NEXT: v_or_b32_e32 v1, v2, v1 -; CHECK-NEXT: v_readlane_b32 s53, v43, 13 -; CHECK-NEXT: v_readlane_b32 s52, v43, 12 -; CHECK-NEXT: v_readlane_b32 s51, v43, 11 -; CHECK-NEXT: v_readlane_b32 s50, v43, 10 -; CHECK-NEXT: v_readlane_b32 s49, v43, 9 -; CHECK-NEXT: v_readlane_b32 s48, v43, 8 -; CHECK-NEXT: v_readlane_b32 s39, v43, 7 -; CHECK-NEXT: v_readlane_b32 s38, v43, 6 -; CHECK-NEXT: v_readlane_b32 s37, v43, 5 -; CHECK-NEXT: v_readlane_b32 s36, v43, 4 -; CHECK-NEXT: v_readlane_b32 s35, v43, 3 -; CHECK-NEXT: v_readlane_b32 s34, v43, 2 -; CHECK-NEXT: v_readlane_b32 s31, v43, 1 -; CHECK-NEXT: v_readlane_b32 s30, v43, 0 +; CHECK-NEXT: v_readlane_b32 s31, v43, 13 +; CHECK-NEXT: v_readlane_b32 s53, v43, 11 +; CHECK-NEXT: v_readlane_b32 s52, v43, 10 +; CHECK-NEXT: v_readlane_b32 s51, v43, 9 +; CHECK-NEXT: v_readlane_b32 s50, v43, 8 +; CHECK-NEXT: v_readlane_b32 s49, v43, 7 +; CHECK-NEXT: v_readlane_b32 s48, v43, 6 +; CHECK-NEXT: v_readlane_b32 s39, v43, 5 +; CHECK-NEXT: v_readlane_b32 s38, v43, 4 +; CHECK-NEXT: v_readlane_b32 s37, v43, 3 +; CHECK-NEXT: v_readlane_b32 s36, v43, 2 +; CHECK-NEXT: v_readlane_b32 s35, v43, 1 +; CHECK-NEXT: v_readlane_b32 s34, v43, 0 ; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: v_readlane_b32 s4, v43, 14 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -542,30 +542,30 @@ define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) { ; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[18:19] ; CHECK-NEXT: v_writelane_b32 v42, s16, 14 -; CHECK-NEXT: v_writelane_b32 v42, s30, 0 -; CHECK-NEXT: v_writelane_b32 v42, s31, 1 -; CHECK-NEXT: v_writelane_b32 v42, s34, 2 -; CHECK-NEXT: v_writelane_b32 v42, s35, 3 -; CHECK-NEXT: v_writelane_b32 v42, s36, 4 -; CHECK-NEXT: v_writelane_b32 v42, s37, 5 -; CHECK-NEXT: v_writelane_b32 v42, s38, 6 -; CHECK-NEXT: v_writelane_b32 v42, s39, 7 +; CHECK-NEXT: v_writelane_b32 v42, s34, 0 +; CHECK-NEXT: v_writelane_b32 v42, s35, 1 +; CHECK-NEXT: v_writelane_b32 v42, s36, 2 +; CHECK-NEXT: v_writelane_b32 v42, s37, 3 +; CHECK-NEXT: v_writelane_b32 v42, s38, 4 +; CHECK-NEXT: v_writelane_b32 v42, s39, 5 ; CHECK-NEXT: s_addk_i32 s32, 0x400 -; CHECK-NEXT: v_writelane_b32 v42, s48, 8 -; CHECK-NEXT: v_writelane_b32 v42, s49, 9 +; CHECK-NEXT: v_writelane_b32 v42, s48, 6 +; CHECK-NEXT: v_writelane_b32 v42, s49, 7 ; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5] ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12 +; CHECK-NEXT: v_writelane_b32 v42, s50, 8 ; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: v_writelane_b32 v42, s50, 10 -; CHECK-NEXT: v_writelane_b32 v42, s51, 11 -; CHECK-NEXT: v_writelane_b32 v42, s52, 12 +; CHECK-NEXT: v_writelane_b32 v42, s51, 9 +; CHECK-NEXT: v_writelane_b32 v42, s52, 10 +; CHECK-NEXT: v_writelane_b32 v42, s53, 11 +; CHECK-NEXT: v_writelane_b32 v42, s30, 12 ; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 ; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; CHECK-NEXT: v_writelane_b32 v42, s53, 13 +; CHECK-NEXT: v_writelane_b32 v42, s31, 13 ; CHECK-NEXT: v_mov_b32_e32 v40, v31 ; CHECK-NEXT: s_mov_b32 s50, s15 ; CHECK-NEXT: s_mov_b32 s51, s14 @@ -596,20 +596,20 @@ define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) { ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: v_readlane_b32 s53, v42, 13 -; CHECK-NEXT: v_readlane_b32 s52, v42, 12 -; CHECK-NEXT: v_readlane_b32 s51, v42, 11 -; CHECK-NEXT: v_readlane_b32 s50, v42, 10 -; CHECK-NEXT: v_readlane_b32 s49, v42, 9 -; CHECK-NEXT: v_readlane_b32 s48, v42, 8 -; CHECK-NEXT: v_readlane_b32 s39, v42, 7 -; CHECK-NEXT: v_readlane_b32 s38, v42, 6 -; CHECK-NEXT: v_readlane_b32 s37, v42, 5 -; CHECK-NEXT: v_readlane_b32 s36, v42, 4 -; CHECK-NEXT: v_readlane_b32 s35, v42, 3 -; CHECK-NEXT: v_readlane_b32 s34, v42, 2 -; CHECK-NEXT: v_readlane_b32 s31, v42, 1 -; CHECK-NEXT: v_readlane_b32 s30, v42, 0 +; CHECK-NEXT: v_readlane_b32 s30, v42, 12 +; CHECK-NEXT: v_readlane_b32 s31, v42, 13 +; CHECK-NEXT: v_readlane_b32 s53, v42, 11 +; CHECK-NEXT: v_readlane_b32 s52, v42, 10 +; CHECK-NEXT: v_readlane_b32 s51, v42, 9 +; CHECK-NEXT: v_readlane_b32 s50, v42, 8 +; CHECK-NEXT: v_readlane_b32 s49, v42, 7 +; CHECK-NEXT: v_readlane_b32 s48, v42, 6 +; CHECK-NEXT: v_readlane_b32 s39, v42, 5 +; CHECK-NEXT: v_readlane_b32 s38, v42, 4 +; CHECK-NEXT: v_readlane_b32 s37, v42, 3 +; CHECK-NEXT: v_readlane_b32 s36, v42, 2 +; CHECK-NEXT: v_readlane_b32 s35, v42, 1 +; CHECK-NEXT: v_readlane_b32 s34, v42, 0 ; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: v_readlane_b32 s4, v42, 14 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -683,32 +683,32 @@ define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) { ; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[18:19] ; CHECK-NEXT: v_writelane_b32 v43, s16, 14 -; CHECK-NEXT: v_writelane_b32 v43, s30, 0 -; CHECK-NEXT: v_writelane_b32 v43, s31, 1 -; CHECK-NEXT: v_writelane_b32 v43, s34, 2 -; CHECK-NEXT: v_writelane_b32 v43, s35, 3 -; CHECK-NEXT: v_writelane_b32 v43, s36, 4 -; CHECK-NEXT: v_writelane_b32 v43, s37, 5 -; CHECK-NEXT: v_writelane_b32 v43, s38, 6 -; CHECK-NEXT: v_writelane_b32 v43, s39, 7 +; CHECK-NEXT: v_writelane_b32 v43, s34, 0 +; CHECK-NEXT: v_writelane_b32 v43, s35, 1 +; CHECK-NEXT: v_writelane_b32 v43, s36, 2 +; CHECK-NEXT: v_writelane_b32 v43, s37, 3 +; CHECK-NEXT: v_writelane_b32 v43, s38, 4 +; CHECK-NEXT: v_writelane_b32 v43, s39, 5 ; CHECK-NEXT: s_addk_i32 s32, 0x800 -; CHECK-NEXT: v_writelane_b32 v43, s48, 8 -; CHECK-NEXT: v_writelane_b32 v43, s49, 9 +; CHECK-NEXT: v_writelane_b32 v43, s48, 6 +; CHECK-NEXT: v_writelane_b32 v43, s49, 7 ; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5] ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12 +; CHECK-NEXT: v_writelane_b32 v43, s50, 8 ; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: v_writelane_b32 v43, s50, 10 +; CHECK-NEXT: v_writelane_b32 v43, s51, 9 +; CHECK-NEXT: v_writelane_b32 v43, s52, 10 ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; CHECK-NEXT: v_writelane_b32 v43, s51, 11 +; CHECK-NEXT: v_writelane_b32 v43, s53, 11 ; CHECK-NEXT: v_mov_b32_e32 v41, v1 -; CHECK-NEXT: v_writelane_b32 v43, s52, 12 +; CHECK-NEXT: v_writelane_b32 v43, s30, 12 ; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v41 ; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] -; CHECK-NEXT: v_writelane_b32 v43, s53, 13 +; CHECK-NEXT: v_writelane_b32 v43, s31, 13 ; CHECK-NEXT: v_mov_b32_e32 v40, v31 ; CHECK-NEXT: s_mov_b32 s50, s15 ; CHECK-NEXT: s_mov_b32 s51, s14 @@ -741,21 +741,21 @@ define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) { ; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; CHECK-NEXT: v_readlane_b32 s30, v43, 12 ; CHECK-NEXT: v_or_b32_e32 v1, v2, v1 -; CHECK-NEXT: v_readlane_b32 s53, v43, 13 -; CHECK-NEXT: v_readlane_b32 s52, v43, 12 -; CHECK-NEXT: v_readlane_b32 s51, v43, 11 -; CHECK-NEXT: v_readlane_b32 s50, v43, 10 -; CHECK-NEXT: v_readlane_b32 s49, v43, 9 -; CHECK-NEXT: v_readlane_b32 s48, v43, 8 -; CHECK-NEXT: v_readlane_b32 s39, v43, 7 -; CHECK-NEXT: v_readlane_b32 s38, v43, 6 -; CHECK-NEXT: v_readlane_b32 s37, v43, 5 -; CHECK-NEXT: v_readlane_b32 s36, v43, 4 -; CHECK-NEXT: v_readlane_b32 s35, v43, 3 -; CHECK-NEXT: v_readlane_b32 s34, v43, 2 -; CHECK-NEXT: v_readlane_b32 s31, v43, 1 -; CHECK-NEXT: v_readlane_b32 s30, v43, 0 +; CHECK-NEXT: v_readlane_b32 s31, v43, 13 +; CHECK-NEXT: v_readlane_b32 s53, v43, 11 +; CHECK-NEXT: v_readlane_b32 s52, v43, 10 +; CHECK-NEXT: v_readlane_b32 s51, v43, 9 +; CHECK-NEXT: v_readlane_b32 s50, v43, 8 +; CHECK-NEXT: v_readlane_b32 s49, v43, 7 +; CHECK-NEXT: v_readlane_b32 s48, v43, 6 +; CHECK-NEXT: v_readlane_b32 s39, v43, 5 +; CHECK-NEXT: v_readlane_b32 s38, v43, 4 +; CHECK-NEXT: v_readlane_b32 s37, v43, 3 +; CHECK-NEXT: v_readlane_b32 s36, v43, 2 +; CHECK-NEXT: v_readlane_b32 s35, v43, 1 +; CHECK-NEXT: v_readlane_b32 s34, v43, 0 ; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: v_readlane_b32 s4, v43, 14 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll index 583b6fe0a81ca..3bed751c979c3 100644 --- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll +++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll @@ -214,8 +214,8 @@ define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 { ; GFX8-NEXT: v_writelane_b32 v3, s31, 1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX8-NEXT: v_readlane_b32 s31, v3, 1 ; GFX8-NEXT: v_readlane_b32 s30, v3, 0 +; GFX8-NEXT: v_readlane_b32 s31, v3, 1 ; GFX8-NEXT: s_mov_b32 s32, s33 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload @@ -242,8 +242,8 @@ define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 { ; GFX8-ARCH-FLAT-NEXT: v_writelane_b32 v3, s31, 1 ; GFX8-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-ARCH-FLAT-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX8-ARCH-FLAT-NEXT: v_readlane_b32 s31, v3, 1 ; GFX8-ARCH-FLAT-NEXT: v_readlane_b32 s30, v3, 0 +; GFX8-ARCH-FLAT-NEXT: v_readlane_b32 s31, v3, 1 ; GFX8-ARCH-FLAT-NEXT: s_mov_b32 s32, s33 ; GFX8-ARCH-FLAT-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX8-ARCH-FLAT-NEXT: s_add_i32 s3, s33, 8 @@ -270,8 +270,8 @@ define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 { ; GFX9-NEXT: v_writelane_b32 v3, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: v_readlane_b32 s31, v3, 1 ; GFX9-NEXT: v_readlane_b32 s30, v3, 0 +; GFX9-NEXT: v_readlane_b32 s31, v3, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload @@ -297,8 +297,8 @@ define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 { ; GFX9-ARCH-FLAT-NEXT: v_writelane_b32 v3, s31, 1 ; GFX9-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-ARCH-FLAT-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX9-ARCH-FLAT-NEXT: v_readlane_b32 s31, v3, 1 ; GFX9-ARCH-FLAT-NEXT: v_readlane_b32 s30, v3, 0 +; GFX9-ARCH-FLAT-NEXT: v_readlane_b32 s31, v3, 1 ; GFX9-ARCH-FLAT-NEXT: s_mov_b32 s32, s33 ; GFX9-ARCH-FLAT-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX9-ARCH-FLAT-NEXT: scratch_load_dword v3, off, s33 ; 4-byte Folded Reload @@ -321,11 +321,12 @@ define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 { ; GFX942-ARCH-FLAT-NEXT: s_addc_u32 s1, s1, with_private_to_flat_addrspacecast@gotpcrel32@hi+12 ; GFX942-ARCH-FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX942-ARCH-FLAT-NEXT: v_writelane_b32 v3, s30, 0 +; GFX942-ARCH-FLAT-NEXT: s_nop 1 ; GFX942-ARCH-FLAT-NEXT: v_writelane_b32 v3, s31, 1 ; GFX942-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-ARCH-FLAT-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX942-ARCH-FLAT-NEXT: v_readlane_b32 s31, v3, 1 ; GFX942-ARCH-FLAT-NEXT: v_readlane_b32 s30, v3, 0 +; GFX942-ARCH-FLAT-NEXT: v_readlane_b32 s31, v3, 1 ; GFX942-ARCH-FLAT-NEXT: s_mov_b32 s32, s33 ; GFX942-ARCH-FLAT-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX942-ARCH-FLAT-NEXT: scratch_load_dword v3, off, s33 ; 4-byte Folded Reload @@ -352,8 +353,8 @@ define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 { ; GFX10-NEXT: v_writelane_b32 v3, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX10-NEXT: v_readlane_b32 s31, v3, 1 ; GFX10-NEXT: v_readlane_b32 s30, v3, 0 +; GFX10-NEXT: v_readlane_b32 s31, v3, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index ab9cd8e037734..7d1e47ab8acb0 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -4395,8 +4395,8 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s31, v2, 1 ; GCN-NEXT: v_readlane_b32 s30, v2, 0 +; GCN-NEXT: v_readlane_b32 s31, v2, 1 ; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload @@ -4424,10 +4424,10 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_readlane_b32 s30, v2, 0 ; GFX7-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readlane_b32 s31, v2, 1 -; GFX7-NEXT: v_readlane_b32 s30, v2, 0 ; GFX7-NEXT: s_mov_b32 s32, s33 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload @@ -4453,10 +4453,10 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX8-NEXT: v_writelane_b32 v2, s31, 1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX8-NEXT: v_readlane_b32 s30, v2, 0 ; GFX8-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readlane_b32 s31, v2, 1 -; GFX8-NEXT: v_readlane_b32 s30, v2, 0 ; GFX8-NEXT: s_mov_b32 s32, s33 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload @@ -4482,10 +4482,10 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX900-NEXT: v_writelane_b32 v2, s31, 1 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX900-NEXT: v_readlane_b32 s30, v2, 0 ; GFX900-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_readlane_b32 s31, v2, 1 -; GFX900-NEXT: v_readlane_b32 s30, v2, 0 ; GFX900-NEXT: s_mov_b32 s32, s33 ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload @@ -4508,13 +4508,14 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX950-NEXT: s_addc_u32 s1, s1, test_arg_store@gotpcrel32@hi+12 ; GFX950-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX950-NEXT: v_writelane_b32 v4, s30, 0 +; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_writelane_b32 v4, s31, 1 ; GFX950-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX950-NEXT: v_readlane_b32 s30, v4, 0 ; GFX950-NEXT: scratch_store_short v1, v0, off sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_readlane_b32 s31, v4, 1 -; GFX950-NEXT: v_readlane_b32 s30, v4, 0 ; GFX950-NEXT: s_mov_b32 s32, s33 ; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX950-NEXT: scratch_load_dword v4, off, s33 ; 4-byte Folded Reload @@ -4541,10 +4542,10 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX10-NEXT: v_writelane_b32 v2, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX10-NEXT: v_readlane_b32 s30, v2, 0 ; GFX10-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_readlane_b32 s31, v2, 1 -; GFX10-NEXT: v_readlane_b32 s30, v2, 0 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload @@ -4571,10 +4572,11 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX11-NEXT: v_writelane_b32 v2, s31, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s30, v2, 0 ; GFX11-NEXT: scratch_store_b16 v1, v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_readlane_b32 s31, v2, 1 -; GFX11-NEXT: v_readlane_b32 s30, v2, 0 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload @@ -4601,10 +4603,11 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX1250-NEXT: v_writelane_b32 v4, s31, 1 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_swap_pc_i64 s[30:31], s[0:1] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_readlane_b32 s30, v4, 0 ; GFX1250-NEXT: scratch_store_b16 v1, v0, off scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: v_readlane_b32 s31, v4, 1 -; GFX1250-NEXT: v_readlane_b32 s30, v4, 0 ; GFX1250-NEXT: s_mov_b32 s32, s33 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_xor_saveexec_b32 s0, -1 @@ -4648,8 +4651,8 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s31, v4, 1 ; GCN-NEXT: v_readlane_b32 s30, v4, 0 +; GCN-NEXT: v_readlane_b32 s31, v4, 1 ; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload @@ -4679,13 +4682,13 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 2, v2 +; GFX7-NEXT: v_readlane_b32 s30, v4, 0 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: buffer_store_short v1, v3, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readlane_b32 s31, v4, 1 -; GFX7-NEXT: v_readlane_b32 s30, v4, 0 ; GFX7-NEXT: s_mov_b32 s32, s33 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload @@ -4711,10 +4714,10 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: v_writelane_b32 v2, s31, 1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX8-NEXT: v_readlane_b32 s30, v2, 0 ; GFX8-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readlane_b32 s31, v2, 1 -; GFX8-NEXT: v_readlane_b32 s30, v2, 0 ; GFX8-NEXT: s_mov_b32 s32, s33 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload @@ -4740,10 +4743,10 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GFX900-NEXT: v_writelane_b32 v2, s31, 1 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX900-NEXT: v_readlane_b32 s30, v2, 0 ; GFX900-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_readlane_b32 s31, v2, 1 -; GFX900-NEXT: v_readlane_b32 s30, v2, 0 ; GFX900-NEXT: s_mov_b32 s32, s33 ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload @@ -4766,13 +4769,14 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GFX950-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GFX950-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX950-NEXT: v_writelane_b32 v4, s30, 0 +; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_writelane_b32 v4, s31, 1 ; GFX950-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX950-NEXT: v_readlane_b32 s30, v4, 0 ; GFX950-NEXT: scratch_store_dword v1, v0, off sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_readlane_b32 s31, v4, 1 -; GFX950-NEXT: v_readlane_b32 s30, v4, 0 ; GFX950-NEXT: s_mov_b32 s32, s33 ; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX950-NEXT: scratch_load_dword v4, off, s33 ; 4-byte Folded Reload @@ -4799,10 +4803,10 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10-NEXT: v_writelane_b32 v2, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX10-NEXT: v_readlane_b32 s30, v2, 0 ; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_readlane_b32 s31, v2, 1 -; GFX10-NEXT: v_readlane_b32 s30, v2, 0 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload @@ -4829,10 +4833,11 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GFX11-NEXT: v_writelane_b32 v2, s31, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s30, v2, 0 ; GFX11-NEXT: scratch_store_b32 v1, v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_readlane_b32 s31, v2, 1 -; GFX11-NEXT: v_readlane_b32 s30, v2, 0 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload @@ -4859,10 +4864,11 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GFX1250-NEXT: v_writelane_b32 v4, s31, 1 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_swap_pc_i64 s[30:31], s[0:1] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_readlane_b32 s30, v4, 0 ; GFX1250-NEXT: scratch_store_b32 v1, v0, off scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: v_readlane_b32 s31, v4, 1 -; GFX1250-NEXT: v_readlane_b32 s30, v4, 0 ; GFX1250-NEXT: s_mov_b32 s32, s33 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_xor_saveexec_b32 s0, -1 @@ -4908,8 +4914,8 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_dword v0, v3, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s31, v5, 1 ; GCN-NEXT: v_readlane_b32 s30, v5, 0 +; GCN-NEXT: v_readlane_b32 s31, v5, 1 ; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload @@ -4942,12 +4948,12 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 4, v3 +; GFX7-NEXT: v_readlane_b32 s30, v4, 0 ; GFX7-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, v3, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readlane_b32 s31, v4, 1 -; GFX7-NEXT: v_readlane_b32 s30, v4, 0 ; GFX7-NEXT: s_mov_b32 s32, s33 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload @@ -4974,12 +4980,12 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 +; GFX8-NEXT: v_readlane_b32 s30, v4, 0 ; GFX8-NEXT: buffer_store_short v1, v3, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readlane_b32 s31, v4, 1 -; GFX8-NEXT: v_readlane_b32 s30, v4, 0 ; GFX8-NEXT: s_mov_b32 s32, s33 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload @@ -5005,12 +5011,12 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX900-NEXT: v_writelane_b32 v3, s31, 1 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX900-NEXT: v_readlane_b32 s30, v3, 0 ; GFX900-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_readlane_b32 s31, v3, 1 -; GFX900-NEXT: v_readlane_b32 s30, v3, 0 ; GFX900-NEXT: s_mov_b32 s32, s33 ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload @@ -5033,16 +5039,17 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX950-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GFX950-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX950-NEXT: v_writelane_b32 v5, s30, 0 -; GFX950-NEXT: v_writelane_b32 v5, s31, 1 ; GFX950-NEXT: v_mov_b32_e32 v4, v2 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_writelane_b32 v5, s31, 1 ; GFX950-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX950-NEXT: v_readlane_b32 s30, v5, 0 ; GFX950-NEXT: scratch_store_short v4, v1, off offset:4 sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: scratch_store_dword v4, v0, off sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_readlane_b32 s31, v5, 1 -; GFX950-NEXT: v_readlane_b32 s30, v5, 0 ; GFX950-NEXT: s_mov_b32 s32, s33 ; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX950-NEXT: scratch_load_dword v5, off, s33 ; 4-byte Folded Reload @@ -5069,12 +5076,12 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10-NEXT: v_writelane_b32 v3, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX10-NEXT: v_readlane_b32 s30, v3, 0 ; GFX10-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_readlane_b32 s31, v3, 1 -; GFX10-NEXT: v_readlane_b32 s30, v3, 0 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload @@ -5101,12 +5108,13 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX11-NEXT: v_writelane_b32 v3, s31, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s30, v3, 0 ; GFX11-NEXT: scratch_store_b16 v2, v1, off offset:4 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_store_b32 v2, v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_readlane_b32 s31, v3, 1 -; GFX11-NEXT: v_readlane_b32 s30, v3, 0 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v3, off, s33 ; 4-byte Folded Reload @@ -5134,12 +5142,13 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX1250-NEXT: v_writelane_b32 v5, s31, 1 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_swap_pc_i64 s[30:31], s[0:1] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_readlane_b32 s30, v5, 0 ; GFX1250-NEXT: scratch_store_b16 v4, v1, off offset:4 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: scratch_store_b32 v4, v0, off scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: v_readlane_b32 s31, v5, 1 -; GFX1250-NEXT: v_readlane_b32 s30, v5, 0 ; GFX1250-NEXT: s_mov_b32 s32, s33 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_xor_saveexec_b32 s0, -1 @@ -5193,8 +5202,8 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_short v0, v4, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s31, v8, 1 ; GCN-NEXT: v_readlane_b32 s30, v8, 0 +; GCN-NEXT: v_readlane_b32 s31, v8, 1 ; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 ; 4-byte Folded Reload @@ -5234,13 +5243,13 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: buffer_store_short v2, v3, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 2, v4 +; GFX7-NEXT: v_readlane_b32 s30, v6, 0 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_short v0, v4, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readlane_b32 s31, v6, 1 -; GFX7-NEXT: v_readlane_b32 s30, v6, 0 ; GFX7-NEXT: s_mov_b32 s32, s33 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload @@ -5267,12 +5276,12 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 +; GFX8-NEXT: v_readlane_b32 s30, v4, 0 ; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readlane_b32 s31, v4, 1 -; GFX8-NEXT: v_readlane_b32 s30, v4, 0 ; GFX8-NEXT: s_mov_b32 s32, s33 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload @@ -5298,12 +5307,12 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX900-NEXT: v_writelane_b32 v3, s31, 1 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX900-NEXT: v_readlane_b32 s30, v3, 0 ; GFX900-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_readlane_b32 s31, v3, 1 -; GFX900-NEXT: v_readlane_b32 s30, v3, 0 ; GFX900-NEXT: s_mov_b32 s32, s33 ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload @@ -5326,14 +5335,15 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX950-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GFX950-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX950-NEXT: v_writelane_b32 v5, s30, 0 -; GFX950-NEXT: v_writelane_b32 v5, s31, 1 ; GFX950-NEXT: v_mov_b32_e32 v4, v2 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_writelane_b32 v5, s31, 1 ; GFX950-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX950-NEXT: v_readlane_b32 s30, v5, 0 ; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_readlane_b32 s31, v5, 1 -; GFX950-NEXT: v_readlane_b32 s30, v5, 0 ; GFX950-NEXT: s_mov_b32 s32, s33 ; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX950-NEXT: scratch_load_dword v5, off, s33 ; 4-byte Folded Reload @@ -5360,12 +5370,12 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10-NEXT: v_writelane_b32 v3, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX10-NEXT: v_readlane_b32 s30, v3, 0 ; GFX10-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_readlane_b32 s31, v3, 1 -; GFX10-NEXT: v_readlane_b32 s30, v3, 0 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload @@ -5392,10 +5402,11 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX11-NEXT: v_writelane_b32 v3, s31, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s30, v3, 0 ; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_readlane_b32 s31, v3, 1 -; GFX11-NEXT: v_readlane_b32 s30, v3, 0 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v3, off, s33 ; 4-byte Folded Reload @@ -5423,10 +5434,11 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX1250-NEXT: v_writelane_b32 v5, s31, 1 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_swap_pc_i64 s[30:31], s[0:1] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_readlane_b32 s30, v5, 0 ; GFX1250-NEXT: scratch_store_b64 v4, v[0:1], off scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: v_readlane_b32 s31, v5, 1 -; GFX1250-NEXT: v_readlane_b32 s30, v5, 0 ; GFX1250-NEXT: s_mov_b32 s32, s33 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_xor_saveexec_b32 s0, -1 @@ -5500,8 +5512,8 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_short v0, v8, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s31, v16, 1 ; GCN-NEXT: v_readlane_b32 s30, v16, 0 +; GCN-NEXT: v_readlane_b32 s31, v16, 1 ; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 ; 4-byte Folded Reload @@ -5561,13 +5573,13 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: buffer_store_short v2, v3, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 2, v8 +; GFX7-NEXT: v_readlane_b32 s30, v10, 0 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_short v0, v8, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readlane_b32 s31, v10, 1 -; GFX7-NEXT: v_readlane_b32 s30, v10, 0 ; GFX7-NEXT: s_mov_b32 s32, s33 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload @@ -5600,12 +5612,12 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v4 +; GFX8-NEXT: v_readlane_b32 s30, v6, 0 ; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readlane_b32 s31, v6, 1 -; GFX8-NEXT: v_readlane_b32 s30, v6, 0 ; GFX8-NEXT: s_mov_b32 s32, s33 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload @@ -5631,6 +5643,7 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX900-NEXT: v_writelane_b32 v5, s31, 1 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX900-NEXT: v_readlane_b32 s30, v5, 0 ; GFX900-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:12 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:8 @@ -5640,7 +5653,6 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX900-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_readlane_b32 s31, v5, 1 -; GFX900-NEXT: v_readlane_b32 s30, v5, 0 ; GFX900-NEXT: s_mov_b32 s32, s33 ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload @@ -5663,13 +5675,14 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX950-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GFX950-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX950-NEXT: v_writelane_b32 v5, s30, 0 +; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_writelane_b32 v5, s31, 1 ; GFX950-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX950-NEXT: v_readlane_b32 s30, v5, 0 ; GFX950-NEXT: scratch_store_dwordx4 v4, v[0:3], off sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_readlane_b32 s31, v5, 1 -; GFX950-NEXT: v_readlane_b32 s30, v5, 0 ; GFX950-NEXT: s_mov_b32 s32, s33 ; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX950-NEXT: scratch_load_dword v5, off, s33 ; 4-byte Folded Reload @@ -5696,6 +5709,7 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10-NEXT: v_writelane_b32 v5, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX10-NEXT: v_readlane_b32 s30, v5, 0 ; GFX10-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:12 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:8 @@ -5705,7 +5719,6 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_readlane_b32 s31, v5, 1 -; GFX10-NEXT: v_readlane_b32 s30, v5, 0 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload @@ -5732,10 +5745,11 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX11-NEXT: v_writelane_b32 v5, s31, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s30, v5, 0 ; GFX11-NEXT: scratch_store_b128 v4, v[0:3], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_readlane_b32 s31, v5, 1 -; GFX11-NEXT: v_readlane_b32 s30, v5, 0 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v5, off, s33 ; 4-byte Folded Reload @@ -5762,10 +5776,11 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX1250-NEXT: v_writelane_b32 v5, s31, 1 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_swap_pc_i64 s[30:31], s[0:1] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_readlane_b32 s30, v5, 0 ; GFX1250-NEXT: scratch_store_b128 v4, v[0:3], off scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: v_readlane_b32 s31, v5, 1 -; GFX1250-NEXT: v_readlane_b32 s30, v5, 0 ; GFX1250-NEXT: s_mov_b32 s32, s33 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_xor_saveexec_b32 s0, -1 @@ -5879,8 +5894,8 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_short v0, v16, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s31, v20, 1 ; GCN-NEXT: v_readlane_b32 s30, v20, 0 +; GCN-NEXT: v_readlane_b32 s31, v20, 1 ; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 ; 4-byte Folded Reload @@ -5980,13 +5995,13 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: buffer_store_short v2, v3, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 2, v16 +; GFX7-NEXT: v_readlane_b32 s30, v18, 0 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_short v0, v16, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readlane_b32 s31, v18, 1 -; GFX7-NEXT: v_readlane_b32 s30, v18, 0 ; GFX7-NEXT: s_mov_b32 s32, s33 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s33 ; 4-byte Folded Reload @@ -6031,12 +6046,12 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v8 +; GFX8-NEXT: v_readlane_b32 s30, v10, 0 ; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_store_dword v0, v8, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readlane_b32 s31, v10, 1 -; GFX8-NEXT: v_readlane_b32 s30, v10, 0 ; GFX8-NEXT: s_mov_b32 s32, s33 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload @@ -6062,6 +6077,7 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX900-NEXT: v_writelane_b32 v9, s31, 1 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX900-NEXT: v_readlane_b32 s30, v9, 0 ; GFX900-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen offset:28 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen offset:24 @@ -6079,7 +6095,6 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX900-NEXT: buffer_store_dword v0, v8, s[0:3], 0 offen ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_readlane_b32 s31, v9, 1 -; GFX900-NEXT: v_readlane_b32 s30, v9, 0 ; GFX900-NEXT: s_mov_b32 s32, s33 ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload @@ -6102,15 +6117,16 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX950-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GFX950-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX950-NEXT: v_writelane_b32 v9, s30, 0 +; GFX950-NEXT: s_nop 1 ; GFX950-NEXT: v_writelane_b32 v9, s31, 1 ; GFX950-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX950-NEXT: v_readlane_b32 s30, v9, 0 ; GFX950-NEXT: scratch_store_dwordx4 v8, v[4:7], off offset:16 sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: scratch_store_dwordx4 v8, v[0:3], off sc0 sc1 ; GFX950-NEXT: s_waitcnt vmcnt(0) ; GFX950-NEXT: v_readlane_b32 s31, v9, 1 -; GFX950-NEXT: v_readlane_b32 s30, v9, 0 ; GFX950-NEXT: s_mov_b32 s32, s33 ; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX950-NEXT: scratch_load_dword v9, off, s33 ; 4-byte Folded Reload @@ -6137,6 +6153,7 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10-NEXT: v_writelane_b32 v9, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX10-NEXT: v_readlane_b32 s30, v9, 0 ; GFX10-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen offset:28 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen offset:24 @@ -6154,7 +6171,6 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10-NEXT: buffer_store_dword v0, v8, s[0:3], 0 offen ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_readlane_b32 s31, v9, 1 -; GFX10-NEXT: v_readlane_b32 s30, v9, 0 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload @@ -6181,12 +6197,13 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX11-NEXT: v_writelane_b32 v9, s31, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s30, v9, 0 ; GFX11-NEXT: scratch_store_b128 v8, v[4:7], off offset:16 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_store_b128 v8, v[0:3], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_readlane_b32 s31, v9, 1 -; GFX11-NEXT: v_readlane_b32 s30, v9, 0 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v9, off, s33 ; 4-byte Folded Reload @@ -6213,12 +6230,13 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX1250-NEXT: v_writelane_b32 v9, s31, 1 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_swap_pc_i64 s[30:31], s[0:1] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_readlane_b32 s30, v9, 0 ; GFX1250-NEXT: scratch_store_b128 v8, v[4:7], off offset:16 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: scratch_store_b128 v8, v[0:3], off scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: v_readlane_b32 s31, v9, 1 -; GFX1250-NEXT: v_readlane_b32 s30, v9, 0 ; GFX1250-NEXT: s_mov_b32 s32, s33 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_xor_saveexec_b32 s0, -1 @@ -48879,34 +48897,34 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX8-NEXT: v_and_b32_e32 v0, 1, v20 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[60:61], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v21 +; GFX8-NEXT: v_writelane_b32 v34, s34, 0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[62:63], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v22 +; GFX8-NEXT: v_writelane_b32 v34, s35, 1 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[72:73], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v23 +; GFX8-NEXT: v_writelane_b32 v34, s36, 2 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[74:75], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v24 +; GFX8-NEXT: v_writelane_b32 v34, s37, 3 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[76:77], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v25 -; GFX8-NEXT: v_writelane_b32 v34, s30, 0 +; GFX8-NEXT: v_writelane_b32 v34, s38, 4 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[78:79], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v26 -; GFX8-NEXT: v_writelane_b32 v34, s31, 1 +; GFX8-NEXT: v_writelane_b32 v34, s39, 5 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[88:89], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v27 -; GFX8-NEXT: v_writelane_b32 v34, s34, 2 +; GFX8-NEXT: v_writelane_b32 v34, s30, 6 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[90:91], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v28 -; GFX8-NEXT: v_writelane_b32 v34, s35, 3 +; GFX8-NEXT: v_writelane_b32 v34, s31, 7 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v29 -; GFX8-NEXT: v_writelane_b32 v34, s36, 4 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v30 -; GFX8-NEXT: v_writelane_b32 v34, s37, 5 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[36:37], 1, v0 ; GFX8-NEXT: buffer_load_ushort v0, off, s[0:3], s32 -; GFX8-NEXT: v_writelane_b32 v34, s38, 6 -; GFX8-NEXT: v_writelane_b32 v34, s39, 7 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[38:39], 1, v0 @@ -49032,6 +49050,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v28 ; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v26 ; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v24 +; GFX8-NEXT: v_readlane_b32 s30, v34, 6 ; GFX8-NEXT: v_or_b32_sdwa v8, v16, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v9, v18, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v10, v20, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -49040,14 +49059,13 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX8-NEXT: v_or_b32_sdwa v13, v29, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v14, v27, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v15, v25, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_readlane_b32 s39, v34, 7 -; GFX8-NEXT: v_readlane_b32 s38, v34, 6 -; GFX8-NEXT: v_readlane_b32 s37, v34, 5 -; GFX8-NEXT: v_readlane_b32 s36, v34, 4 -; GFX8-NEXT: v_readlane_b32 s35, v34, 3 -; GFX8-NEXT: v_readlane_b32 s34, v34, 2 -; GFX8-NEXT: v_readlane_b32 s31, v34, 1 -; GFX8-NEXT: v_readlane_b32 s30, v34, 0 +; GFX8-NEXT: v_readlane_b32 s31, v34, 7 +; GFX8-NEXT: v_readlane_b32 s39, v34, 5 +; GFX8-NEXT: v_readlane_b32 s38, v34, 4 +; GFX8-NEXT: v_readlane_b32 s37, v34, 3 +; GFX8-NEXT: v_readlane_b32 s36, v34, 2 +; GFX8-NEXT: v_readlane_b32 s35, v34, 1 +; GFX8-NEXT: v_readlane_b32 s34, v34, 0 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] @@ -49119,11 +49137,11 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX900-NEXT: v_and_b32_e32 v0, 1, v28 ; GFX900-NEXT: v_cmp_eq_u32_e64 s[94:95], 1, v0 ; GFX900-NEXT: buffer_load_ushort v0, off, s[0:3], s32 -; GFX900-NEXT: v_writelane_b32 v33, s30, 0 -; GFX900-NEXT: v_writelane_b32 v33, s31, 1 -; GFX900-NEXT: v_writelane_b32 v33, s34, 2 +; GFX900-NEXT: v_writelane_b32 v33, s34, 0 +; GFX900-NEXT: v_writelane_b32 v33, s35, 1 +; GFX900-NEXT: v_writelane_b32 v33, s30, 2 +; GFX900-NEXT: v_writelane_b32 v33, s31, 3 ; GFX900-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX900-NEXT: v_writelane_b32 v33, s35, 3 ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_and_b32_e32 v0, 1, v0 @@ -49228,6 +49246,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_readlane_b32 s30, v33, 2 ; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 ; GFX900-NEXT: v_perm_b32 v1, v2, v5, s4 ; GFX900-NEXT: v_perm_b32 v2, v4, v7, s4 @@ -49244,10 +49263,9 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX900-NEXT: v_perm_b32 v13, v26, v29, s4 ; GFX900-NEXT: v_perm_b32 v14, v28, v32, s4 ; GFX900-NEXT: v_perm_b32 v15, v31, v30, s4 -; GFX900-NEXT: v_readlane_b32 s35, v33, 3 -; GFX900-NEXT: v_readlane_b32 s34, v33, 2 -; GFX900-NEXT: v_readlane_b32 s31, v33, 1 -; GFX900-NEXT: v_readlane_b32 s30, v33, 0 +; GFX900-NEXT: v_readlane_b32 s31, v33, 3 +; GFX900-NEXT: v_readlane_b32 s35, v33, 1 +; GFX900-NEXT: v_readlane_b32 s34, v33, 0 ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; GFX900-NEXT: s_mov_b64 exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll index ab2ad19d0f1bf..fb11d3b7d9d65 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll @@ -902,47 +902,47 @@ define void @spill_func(ptr addrspace(1) %arg) #0 { ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_waitcnt expcnt(0) -; CHECK-NEXT: v_writelane_b32 v0, s30, 0 -; CHECK-NEXT: v_writelane_b32 v0, s31, 1 -; CHECK-NEXT: v_writelane_b32 v0, s33, 2 -; CHECK-NEXT: v_writelane_b32 v0, s34, 3 -; CHECK-NEXT: v_writelane_b32 v0, s35, 4 -; CHECK-NEXT: v_writelane_b32 v0, s36, 5 -; CHECK-NEXT: v_writelane_b32 v0, s37, 6 -; CHECK-NEXT: v_writelane_b32 v0, s38, 7 -; CHECK-NEXT: v_writelane_b32 v0, s39, 8 -; CHECK-NEXT: v_writelane_b32 v0, s48, 9 -; CHECK-NEXT: v_writelane_b32 v0, s49, 10 -; CHECK-NEXT: v_writelane_b32 v0, s50, 11 -; CHECK-NEXT: v_writelane_b32 v0, s51, 12 -; CHECK-NEXT: v_writelane_b32 v0, s52, 13 -; CHECK-NEXT: v_writelane_b32 v0, s53, 14 -; CHECK-NEXT: v_writelane_b32 v0, s54, 15 -; CHECK-NEXT: v_writelane_b32 v0, s55, 16 -; CHECK-NEXT: v_writelane_b32 v0, s64, 17 -; CHECK-NEXT: v_writelane_b32 v0, s65, 18 -; CHECK-NEXT: v_writelane_b32 v0, s66, 19 -; CHECK-NEXT: v_writelane_b32 v0, s67, 20 -; CHECK-NEXT: v_writelane_b32 v0, s68, 21 -; CHECK-NEXT: v_writelane_b32 v0, s69, 22 -; CHECK-NEXT: v_writelane_b32 v0, s70, 23 -; CHECK-NEXT: v_writelane_b32 v0, s71, 24 -; CHECK-NEXT: v_writelane_b32 v0, s80, 25 -; CHECK-NEXT: v_writelane_b32 v0, s81, 26 -; CHECK-NEXT: v_writelane_b32 v0, s82, 27 -; CHECK-NEXT: v_writelane_b32 v0, s83, 28 -; CHECK-NEXT: v_writelane_b32 v0, s84, 29 -; CHECK-NEXT: v_writelane_b32 v0, s85, 30 -; CHECK-NEXT: v_writelane_b32 v0, s86, 31 -; CHECK-NEXT: v_writelane_b32 v0, s87, 32 -; CHECK-NEXT: v_writelane_b32 v0, s96, 33 -; CHECK-NEXT: v_writelane_b32 v0, s97, 34 -; CHECK-NEXT: v_writelane_b32 v0, s98, 35 -; CHECK-NEXT: v_writelane_b32 v0, s99, 36 +; CHECK-NEXT: v_writelane_b32 v0, s33, 0 +; CHECK-NEXT: v_writelane_b32 v0, s34, 1 +; CHECK-NEXT: v_writelane_b32 v0, s35, 2 +; CHECK-NEXT: v_writelane_b32 v0, s36, 3 +; CHECK-NEXT: v_writelane_b32 v0, s37, 4 +; CHECK-NEXT: v_writelane_b32 v0, s38, 5 +; CHECK-NEXT: v_writelane_b32 v0, s39, 6 +; CHECK-NEXT: v_writelane_b32 v0, s48, 7 +; CHECK-NEXT: v_writelane_b32 v0, s49, 8 +; CHECK-NEXT: v_writelane_b32 v0, s50, 9 +; CHECK-NEXT: v_writelane_b32 v0, s51, 10 +; CHECK-NEXT: v_writelane_b32 v0, s52, 11 +; CHECK-NEXT: v_writelane_b32 v0, s53, 12 +; CHECK-NEXT: v_writelane_b32 v0, s54, 13 +; CHECK-NEXT: v_writelane_b32 v0, s55, 14 +; CHECK-NEXT: v_writelane_b32 v0, s64, 15 +; CHECK-NEXT: v_writelane_b32 v0, s65, 16 +; CHECK-NEXT: v_writelane_b32 v0, s66, 17 +; CHECK-NEXT: v_writelane_b32 v0, s67, 18 +; CHECK-NEXT: v_writelane_b32 v0, s68, 19 +; CHECK-NEXT: v_writelane_b32 v0, s69, 20 +; CHECK-NEXT: v_writelane_b32 v0, s70, 21 +; CHECK-NEXT: v_writelane_b32 v0, s71, 22 +; CHECK-NEXT: v_writelane_b32 v0, s80, 23 +; CHECK-NEXT: v_writelane_b32 v0, s81, 24 +; CHECK-NEXT: v_writelane_b32 v0, s82, 25 +; CHECK-NEXT: v_writelane_b32 v0, s83, 26 +; CHECK-NEXT: v_writelane_b32 v0, s84, 27 +; CHECK-NEXT: v_writelane_b32 v0, s85, 28 +; CHECK-NEXT: v_writelane_b32 v0, s86, 29 +; CHECK-NEXT: v_writelane_b32 v0, s87, 30 +; CHECK-NEXT: v_writelane_b32 v0, s96, 31 +; CHECK-NEXT: v_writelane_b32 v0, s97, 32 +; CHECK-NEXT: v_writelane_b32 v0, s98, 33 +; CHECK-NEXT: v_writelane_b32 v0, s99, 34 +; CHECK-NEXT: v_writelane_b32 v0, s100, 35 +; CHECK-NEXT: v_writelane_b32 v0, s101, 36 ; CHECK-NEXT: s_mov_b32 s40, s12 -; CHECK-NEXT: v_writelane_b32 v0, s100, 37 +; CHECK-NEXT: v_writelane_b32 v0, s30, 37 ; CHECK-NEXT: s_cmp_eq_u32 s40, 0 -; CHECK-NEXT: v_writelane_b32 v0, s101, 38 +; CHECK-NEXT: v_writelane_b32 v0, s31, 38 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: s_mov_b32 s0, 0 ; CHECK-NEXT: ;;#ASMEND @@ -1380,6 +1380,7 @@ define void @spill_func(ptr addrspace(1) %arg) #0 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; reg use s31 ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s30, v0, 37 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; reg use s32 ; CHECK-NEXT: ;;#ASMEND @@ -1596,45 +1597,44 @@ define void @spill_func(ptr addrspace(1) %arg) #0 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; reg use vcc_hi ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s101, v0, 38 -; CHECK-NEXT: v_readlane_b32 s100, v0, 37 -; CHECK-NEXT: v_readlane_b32 s99, v0, 36 -; CHECK-NEXT: v_readlane_b32 s98, v0, 35 -; CHECK-NEXT: v_readlane_b32 s97, v0, 34 -; CHECK-NEXT: v_readlane_b32 s96, v0, 33 -; CHECK-NEXT: v_readlane_b32 s87, v0, 32 -; CHECK-NEXT: v_readlane_b32 s86, v0, 31 -; CHECK-NEXT: v_readlane_b32 s85, v0, 30 -; CHECK-NEXT: v_readlane_b32 s84, v0, 29 -; CHECK-NEXT: v_readlane_b32 s83, v0, 28 -; CHECK-NEXT: v_readlane_b32 s82, v0, 27 -; CHECK-NEXT: v_readlane_b32 s81, v0, 26 -; CHECK-NEXT: v_readlane_b32 s80, v0, 25 -; CHECK-NEXT: v_readlane_b32 s71, v0, 24 -; CHECK-NEXT: v_readlane_b32 s70, v0, 23 -; CHECK-NEXT: v_readlane_b32 s69, v0, 22 -; CHECK-NEXT: v_readlane_b32 s68, v0, 21 -; CHECK-NEXT: v_readlane_b32 s67, v0, 20 -; CHECK-NEXT: v_readlane_b32 s66, v0, 19 -; CHECK-NEXT: v_readlane_b32 s65, v0, 18 -; CHECK-NEXT: v_readlane_b32 s64, v0, 17 -; CHECK-NEXT: v_readlane_b32 s55, v0, 16 -; CHECK-NEXT: v_readlane_b32 s54, v0, 15 -; CHECK-NEXT: v_readlane_b32 s53, v0, 14 -; CHECK-NEXT: v_readlane_b32 s52, v0, 13 -; CHECK-NEXT: v_readlane_b32 s51, v0, 12 -; CHECK-NEXT: v_readlane_b32 s50, v0, 11 -; CHECK-NEXT: v_readlane_b32 s49, v0, 10 -; CHECK-NEXT: v_readlane_b32 s48, v0, 9 -; CHECK-NEXT: v_readlane_b32 s39, v0, 8 -; CHECK-NEXT: v_readlane_b32 s38, v0, 7 -; CHECK-NEXT: v_readlane_b32 s37, v0, 6 -; CHECK-NEXT: v_readlane_b32 s36, v0, 5 -; CHECK-NEXT: v_readlane_b32 s35, v0, 4 -; CHECK-NEXT: v_readlane_b32 s34, v0, 3 -; CHECK-NEXT: v_readlane_b32 s33, v0, 2 -; CHECK-NEXT: v_readlane_b32 s31, v0, 1 -; CHECK-NEXT: v_readlane_b32 s30, v0, 0 +; CHECK-NEXT: v_readlane_b32 s31, v0, 38 +; CHECK-NEXT: v_readlane_b32 s101, v0, 36 +; CHECK-NEXT: v_readlane_b32 s100, v0, 35 +; CHECK-NEXT: v_readlane_b32 s99, v0, 34 +; CHECK-NEXT: v_readlane_b32 s98, v0, 33 +; CHECK-NEXT: v_readlane_b32 s97, v0, 32 +; CHECK-NEXT: v_readlane_b32 s96, v0, 31 +; CHECK-NEXT: v_readlane_b32 s87, v0, 30 +; CHECK-NEXT: v_readlane_b32 s86, v0, 29 +; CHECK-NEXT: v_readlane_b32 s85, v0, 28 +; CHECK-NEXT: v_readlane_b32 s84, v0, 27 +; CHECK-NEXT: v_readlane_b32 s83, v0, 26 +; CHECK-NEXT: v_readlane_b32 s82, v0, 25 +; CHECK-NEXT: v_readlane_b32 s81, v0, 24 +; CHECK-NEXT: v_readlane_b32 s80, v0, 23 +; CHECK-NEXT: v_readlane_b32 s71, v0, 22 +; CHECK-NEXT: v_readlane_b32 s70, v0, 21 +; CHECK-NEXT: v_readlane_b32 s69, v0, 20 +; CHECK-NEXT: v_readlane_b32 s68, v0, 19 +; CHECK-NEXT: v_readlane_b32 s67, v0, 18 +; CHECK-NEXT: v_readlane_b32 s66, v0, 17 +; CHECK-NEXT: v_readlane_b32 s65, v0, 16 +; CHECK-NEXT: v_readlane_b32 s64, v0, 15 +; CHECK-NEXT: v_readlane_b32 s55, v0, 14 +; CHECK-NEXT: v_readlane_b32 s54, v0, 13 +; CHECK-NEXT: v_readlane_b32 s53, v0, 12 +; CHECK-NEXT: v_readlane_b32 s52, v0, 11 +; CHECK-NEXT: v_readlane_b32 s51, v0, 10 +; CHECK-NEXT: v_readlane_b32 s50, v0, 9 +; CHECK-NEXT: v_readlane_b32 s49, v0, 8 +; CHECK-NEXT: v_readlane_b32 s48, v0, 7 +; CHECK-NEXT: v_readlane_b32 s39, v0, 6 +; CHECK-NEXT: v_readlane_b32 s38, v0, 5 +; CHECK-NEXT: v_readlane_b32 s37, v0, 4 +; CHECK-NEXT: v_readlane_b32 s36, v0, 3 +; CHECK-NEXT: v_readlane_b32 s35, v0, 2 +; CHECK-NEXT: v_readlane_b32 s34, v0, 1 +; CHECK-NEXT: v_readlane_b32 s33, v0, 0 ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll index c407f7645315d..d86fbb9754c02 100644 --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -7178,8 +7178,8 @@ define void @stack_12xv3i32() #0 { ; VI-NEXT: v_mov_b32_e32 v30, 10 ; VI-NEXT: v_writelane_b32 v40, s31, 1 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] -; VI-NEXT: v_readlane_b32 s31, v40, 1 ; VI-NEXT: v_readlane_b32 s30, v40, 0 +; VI-NEXT: v_readlane_b32 s31, v40, 1 ; VI-NEXT: s_mov_b32 s32, s33 ; VI-NEXT: v_readlane_b32 s4, v40, 2 ; VI-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -7246,8 +7246,8 @@ define void @stack_12xv3i32() #0 { ; CI-NEXT: v_mov_b32_e32 v30, 10 ; CI-NEXT: v_writelane_b32 v40, s31, 1 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] -; CI-NEXT: v_readlane_b32 s31, v40, 1 ; CI-NEXT: v_readlane_b32 s30, v40, 0 +; CI-NEXT: v_readlane_b32 s31, v40, 1 ; CI-NEXT: s_mov_b32 s32, s33 ; CI-NEXT: v_readlane_b32 s4, v40, 2 ; CI-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -7314,8 +7314,8 @@ define void @stack_12xv3i32() #0 { ; GFX9-NEXT: v_mov_b32_e32 v30, 10 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -7364,8 +7364,8 @@ define void @stack_12xv3i32() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -7432,8 +7432,8 @@ define void @stack_12xv3i32() #0 { ; HSA-NEXT: v_mov_b32_e32 v30, 10 ; HSA-NEXT: v_writelane_b32 v40, s31, 1 ; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] -; HSA-NEXT: v_readlane_b32 s31, v40, 1 ; HSA-NEXT: v_readlane_b32 s30, v40, 0 +; HSA-NEXT: v_readlane_b32 s31, v40, 1 ; HSA-NEXT: s_mov_b32 s32, s33 ; HSA-NEXT: v_readlane_b32 s4, v40, 2 ; HSA-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -7517,8 +7517,8 @@ define void @stack_12xv3f32() #0 { ; VI-NEXT: v_mov_b32_e32 v30, 0x41200000 ; VI-NEXT: v_writelane_b32 v40, s31, 1 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] -; VI-NEXT: v_readlane_b32 s31, v40, 1 ; VI-NEXT: v_readlane_b32 s30, v40, 0 +; VI-NEXT: v_readlane_b32 s31, v40, 1 ; VI-NEXT: s_mov_b32 s32, s33 ; VI-NEXT: v_readlane_b32 s4, v40, 2 ; VI-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -7585,8 +7585,8 @@ define void @stack_12xv3f32() #0 { ; CI-NEXT: v_mov_b32_e32 v30, 0x41200000 ; CI-NEXT: v_writelane_b32 v40, s31, 1 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] -; CI-NEXT: v_readlane_b32 s31, v40, 1 ; CI-NEXT: v_readlane_b32 s30, v40, 0 +; CI-NEXT: v_readlane_b32 s31, v40, 1 ; CI-NEXT: s_mov_b32 s32, s33 ; CI-NEXT: v_readlane_b32 s4, v40, 2 ; CI-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -7653,8 +7653,8 @@ define void @stack_12xv3f32() #0 { ; GFX9-NEXT: v_mov_b32_e32 v30, 0x41200000 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -7707,8 +7707,8 @@ define void @stack_12xv3f32() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -7775,8 +7775,8 @@ define void @stack_12xv3f32() #0 { ; HSA-NEXT: v_mov_b32_e32 v30, 0x41200000 ; HSA-NEXT: v_writelane_b32 v40, s31, 1 ; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] -; HSA-NEXT: v_readlane_b32 s31, v40, 1 ; HSA-NEXT: v_readlane_b32 s30, v40, 0 +; HSA-NEXT: v_readlane_b32 s31, v40, 1 ; HSA-NEXT: s_mov_b32 s32, s33 ; HSA-NEXT: v_readlane_b32 s4, v40, 2 ; HSA-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -7868,8 +7868,8 @@ define void @stack_8xv5i32() #0 { ; VI-NEXT: v_mov_b32_e32 v30, 6 ; VI-NEXT: v_writelane_b32 v40, s31, 1 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] -; VI-NEXT: v_readlane_b32 s31, v40, 1 ; VI-NEXT: v_readlane_b32 s30, v40, 0 +; VI-NEXT: v_readlane_b32 s31, v40, 1 ; VI-NEXT: s_mov_b32 s32, s33 ; VI-NEXT: v_readlane_b32 s4, v40, 2 ; VI-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -7944,8 +7944,8 @@ define void @stack_8xv5i32() #0 { ; CI-NEXT: v_mov_b32_e32 v30, 6 ; CI-NEXT: v_writelane_b32 v40, s31, 1 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] -; CI-NEXT: v_readlane_b32 s31, v40, 1 ; CI-NEXT: v_readlane_b32 s30, v40, 0 +; CI-NEXT: v_readlane_b32 s31, v40, 1 ; CI-NEXT: s_mov_b32 s32, s33 ; CI-NEXT: v_readlane_b32 s4, v40, 2 ; CI-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -8020,8 +8020,8 @@ define void @stack_8xv5i32() #0 { ; GFX9-NEXT: v_mov_b32_e32 v30, 6 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -8075,8 +8075,8 @@ define void @stack_8xv5i32() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -8151,8 +8151,8 @@ define void @stack_8xv5i32() #0 { ; HSA-NEXT: v_mov_b32_e32 v30, 6 ; HSA-NEXT: v_writelane_b32 v40, s31, 1 ; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] -; HSA-NEXT: v_readlane_b32 s31, v40, 1 ; HSA-NEXT: v_readlane_b32 s30, v40, 0 +; HSA-NEXT: v_readlane_b32 s31, v40, 1 ; HSA-NEXT: s_mov_b32 s32, s33 ; HSA-NEXT: v_readlane_b32 s4, v40, 2 ; HSA-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -8240,8 +8240,8 @@ define void @stack_8xv5f32() #0 { ; VI-NEXT: v_mov_b32_e32 v30, 0x40c00000 ; VI-NEXT: v_writelane_b32 v40, s31, 1 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] -; VI-NEXT: v_readlane_b32 s31, v40, 1 ; VI-NEXT: v_readlane_b32 s30, v40, 0 +; VI-NEXT: v_readlane_b32 s31, v40, 1 ; VI-NEXT: s_mov_b32 s32, s33 ; VI-NEXT: v_readlane_b32 s4, v40, 2 ; VI-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -8316,8 +8316,8 @@ define void @stack_8xv5f32() #0 { ; CI-NEXT: v_mov_b32_e32 v30, 0x40c00000 ; CI-NEXT: v_writelane_b32 v40, s31, 1 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] -; CI-NEXT: v_readlane_b32 s31, v40, 1 ; CI-NEXT: v_readlane_b32 s30, v40, 0 +; CI-NEXT: v_readlane_b32 s31, v40, 1 ; CI-NEXT: s_mov_b32 s32, s33 ; CI-NEXT: v_readlane_b32 s4, v40, 2 ; CI-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -8392,8 +8392,8 @@ define void @stack_8xv5f32() #0 { ; GFX9-NEXT: v_mov_b32_e32 v30, 0x40c00000 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -8450,8 +8450,8 @@ define void @stack_8xv5f32() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -8526,8 +8526,8 @@ define void @stack_8xv5f32() #0 { ; HSA-NEXT: v_mov_b32_e32 v30, 0x40c00000 ; HSA-NEXT: v_writelane_b32 v40, s31, 1 ; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] -; HSA-NEXT: v_readlane_b32 s31, v40, 1 ; HSA-NEXT: v_readlane_b32 s30, v40, 0 +; HSA-NEXT: v_readlane_b32 s31, v40, 1 ; HSA-NEXT: s_mov_b32 s32, s33 ; HSA-NEXT: v_readlane_b32 s4, v40, 2 ; HSA-NEXT: s_or_saveexec_b64 s[6:7], -1 diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll index b250227735bd3..26727e53d990c 100644 --- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll @@ -25,8 +25,8 @@ define void @use_vcc() #1 { ; GCN: v_writelane_b32 v40, s30, 0 ; GCN: v_writelane_b32 v40, s31, 1 ; GCN: s_swappc_b64 -; GCN: v_readlane_b32 s31, v40, 1 ; GCN: v_readlane_b32 s30, v40, 0 +; GCN: v_readlane_b32 s31, v40, 1 ; GCN: v_readlane_b32 s4, v40, 2 ; GCN: s_mov_b32 s33, s4 ; GCN: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll index aed1079158154..f9070339093da 100644 --- a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll @@ -40,22 +40,22 @@ define void @test_func_call_external_void_func_void_clobber_s30_s31_call_externa ; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; MUBUF-NEXT: s_mov_b64 exec, s[6:7] ; MUBUF-NEXT: v_writelane_b32 v40, s4, 4 -; MUBUF-NEXT: v_writelane_b32 v40, s30, 0 -; MUBUF-NEXT: v_writelane_b32 v40, s31, 1 +; MUBUF-NEXT: v_writelane_b32 v40, s34, 0 ; MUBUF-NEXT: s_addk_i32 s32, 0x400 -; MUBUF-NEXT: v_writelane_b32 v40, s34, 2 -; MUBUF-NEXT: v_writelane_b32 v40, s35, 3 +; MUBUF-NEXT: v_writelane_b32 v40, s35, 1 +; MUBUF-NEXT: v_writelane_b32 v40, s30, 2 ; MUBUF-NEXT: s_getpc_b64 s[34:35] ; MUBUF-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 ; MUBUF-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 +; MUBUF-NEXT: v_writelane_b32 v40, s31, 3 ; MUBUF-NEXT: s_swappc_b64 s[30:31], s[34:35] ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ;;#ASMEND ; MUBUF-NEXT: s_swappc_b64 s[30:31], s[34:35] -; MUBUF-NEXT: v_readlane_b32 s35, v40, 3 -; MUBUF-NEXT: v_readlane_b32 s34, v40, 2 -; MUBUF-NEXT: v_readlane_b32 s31, v40, 1 -; MUBUF-NEXT: v_readlane_b32 s30, v40, 0 +; MUBUF-NEXT: v_readlane_b32 s30, v40, 2 +; MUBUF-NEXT: v_readlane_b32 s31, v40, 3 +; MUBUF-NEXT: v_readlane_b32 s35, v40, 1 +; MUBUF-NEXT: v_readlane_b32 s34, v40, 0 ; MUBUF-NEXT: s_mov_b32 s32, s33 ; MUBUF-NEXT: v_readlane_b32 s4, v40, 4 ; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -74,22 +74,22 @@ define void @test_func_call_external_void_func_void_clobber_s30_s31_call_externa ; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] ; FLATSCR-NEXT: v_writelane_b32 v40, s0, 4 -; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0 -; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1 +; FLATSCR-NEXT: v_writelane_b32 v40, s34, 0 ; FLATSCR-NEXT: s_add_i32 s32, s32, 16 -; FLATSCR-NEXT: v_writelane_b32 v40, s34, 2 -; FLATSCR-NEXT: v_writelane_b32 v40, s35, 3 +; FLATSCR-NEXT: v_writelane_b32 v40, s35, 1 +; FLATSCR-NEXT: v_writelane_b32 v40, s30, 2 ; FLATSCR-NEXT: s_getpc_b64 s[34:35] ; FLATSCR-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 ; FLATSCR-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 +; FLATSCR-NEXT: v_writelane_b32 v40, s31, 3 ; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[34:35] ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ;;#ASMEND ; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[34:35] -; FLATSCR-NEXT: v_readlane_b32 s35, v40, 3 -; FLATSCR-NEXT: v_readlane_b32 s34, v40, 2 -; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1 -; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0 +; FLATSCR-NEXT: v_readlane_b32 s30, v40, 2 +; FLATSCR-NEXT: v_readlane_b32 s31, v40, 3 +; FLATSCR-NEXT: v_readlane_b32 s35, v40, 1 +; FLATSCR-NEXT: v_readlane_b32 s34, v40, 0 ; FLATSCR-NEXT: s_mov_b32 s32, s33 ; FLATSCR-NEXT: v_readlane_b32 s0, v40, 4 ; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 @@ -114,20 +114,20 @@ define void @test_func_call_external_void_funcx2() #0 { ; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; MUBUF-NEXT: s_mov_b64 exec, s[6:7] ; MUBUF-NEXT: v_writelane_b32 v40, s4, 4 -; MUBUF-NEXT: v_writelane_b32 v40, s30, 0 -; MUBUF-NEXT: v_writelane_b32 v40, s31, 1 +; MUBUF-NEXT: v_writelane_b32 v40, s34, 0 ; MUBUF-NEXT: s_addk_i32 s32, 0x400 -; MUBUF-NEXT: v_writelane_b32 v40, s34, 2 -; MUBUF-NEXT: v_writelane_b32 v40, s35, 3 +; MUBUF-NEXT: v_writelane_b32 v40, s35, 1 +; MUBUF-NEXT: v_writelane_b32 v40, s30, 2 ; MUBUF-NEXT: s_getpc_b64 s[34:35] ; MUBUF-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 ; MUBUF-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 +; MUBUF-NEXT: v_writelane_b32 v40, s31, 3 ; MUBUF-NEXT: s_swappc_b64 s[30:31], s[34:35] ; MUBUF-NEXT: s_swappc_b64 s[30:31], s[34:35] -; MUBUF-NEXT: v_readlane_b32 s35, v40, 3 -; MUBUF-NEXT: v_readlane_b32 s34, v40, 2 -; MUBUF-NEXT: v_readlane_b32 s31, v40, 1 -; MUBUF-NEXT: v_readlane_b32 s30, v40, 0 +; MUBUF-NEXT: v_readlane_b32 s30, v40, 2 +; MUBUF-NEXT: v_readlane_b32 s31, v40, 3 +; MUBUF-NEXT: v_readlane_b32 s35, v40, 1 +; MUBUF-NEXT: v_readlane_b32 s34, v40, 0 ; MUBUF-NEXT: s_mov_b32 s32, s33 ; MUBUF-NEXT: v_readlane_b32 s4, v40, 4 ; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -146,20 +146,20 @@ define void @test_func_call_external_void_funcx2() #0 { ; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill ; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] ; FLATSCR-NEXT: v_writelane_b32 v40, s0, 4 -; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0 -; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1 +; FLATSCR-NEXT: v_writelane_b32 v40, s34, 0 ; FLATSCR-NEXT: s_add_i32 s32, s32, 16 -; FLATSCR-NEXT: v_writelane_b32 v40, s34, 2 -; FLATSCR-NEXT: v_writelane_b32 v40, s35, 3 +; FLATSCR-NEXT: v_writelane_b32 v40, s35, 1 +; FLATSCR-NEXT: v_writelane_b32 v40, s30, 2 ; FLATSCR-NEXT: s_getpc_b64 s[34:35] ; FLATSCR-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 ; FLATSCR-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 +; FLATSCR-NEXT: v_writelane_b32 v40, s31, 3 ; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[34:35] ; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[34:35] -; FLATSCR-NEXT: v_readlane_b32 s35, v40, 3 -; FLATSCR-NEXT: v_readlane_b32 s34, v40, 2 -; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1 -; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0 +; FLATSCR-NEXT: v_readlane_b32 s30, v40, 2 +; FLATSCR-NEXT: v_readlane_b32 s31, v40, 3 +; FLATSCR-NEXT: v_readlane_b32 s35, v40, 1 +; FLATSCR-NEXT: v_readlane_b32 s34, v40, 0 ; FLATSCR-NEXT: s_mov_b32 s32, s33 ; FLATSCR-NEXT: v_readlane_b32 s0, v40, 4 ; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 @@ -185,8 +185,8 @@ define void @void_func_void_clobber_s30_s31() #2 { ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ; clobber ; MUBUF-NEXT: ;;#ASMEND -; MUBUF-NEXT: v_readlane_b32 s31, v0, 1 ; MUBUF-NEXT: v_readlane_b32 s30, v0, 0 +; MUBUF-NEXT: v_readlane_b32 s31, v0, 1 ; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; MUBUF-NEXT: s_mov_b64 exec, s[4:5] @@ -204,8 +204,8 @@ define void @void_func_void_clobber_s30_s31() #2 { ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ; clobber ; FLATSCR-NEXT: ;;#ASMEND -; FLATSCR-NEXT: v_readlane_b32 s31, v0, 1 ; FLATSCR-NEXT: v_readlane_b32 s30, v0, 0 +; FLATSCR-NEXT: v_readlane_b32 s31, v0, 1 ; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; FLATSCR-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload ; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] @@ -452,23 +452,23 @@ define void @callee_saved_sgpr_func() #2 { ; MUBUF-NEXT: s_mov_b64 exec, s[6:7] ; MUBUF-NEXT: v_writelane_b32 v40, s4, 3 ; MUBUF-NEXT: s_addk_i32 s32, 0x400 -; MUBUF-NEXT: v_writelane_b32 v40, s30, 0 -; MUBUF-NEXT: v_writelane_b32 v40, s31, 1 +; MUBUF-NEXT: v_writelane_b32 v40, s34, 0 +; MUBUF-NEXT: v_writelane_b32 v40, s30, 1 ; MUBUF-NEXT: s_getpc_b64 s[4:5] ; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; MUBUF-NEXT: v_writelane_b32 v40, s34, 2 +; MUBUF-NEXT: v_writelane_b32 v40, s31, 2 ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ; def s40 ; MUBUF-NEXT: ;;#ASMEND ; MUBUF-NEXT: s_mov_b32 s34, s40 ; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MUBUF-NEXT: v_readlane_b32 s30, v40, 1 ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ; use s34 ; MUBUF-NEXT: ;;#ASMEND -; MUBUF-NEXT: v_readlane_b32 s34, v40, 2 -; MUBUF-NEXT: v_readlane_b32 s31, v40, 1 -; MUBUF-NEXT: v_readlane_b32 s30, v40, 0 +; MUBUF-NEXT: v_readlane_b32 s31, v40, 2 +; MUBUF-NEXT: v_readlane_b32 s34, v40, 0 ; MUBUF-NEXT: s_mov_b32 s32, s33 ; MUBUF-NEXT: v_readlane_b32 s4, v40, 3 ; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -488,23 +488,23 @@ define void @callee_saved_sgpr_func() #2 { ; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] ; FLATSCR-NEXT: v_writelane_b32 v40, s0, 3 ; FLATSCR-NEXT: s_add_i32 s32, s32, 16 -; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0 -; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1 +; FLATSCR-NEXT: v_writelane_b32 v40, s34, 0 +; FLATSCR-NEXT: v_writelane_b32 v40, s30, 1 ; FLATSCR-NEXT: s_getpc_b64 s[0:1] ; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 ; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 -; FLATSCR-NEXT: v_writelane_b32 v40, s34, 2 +; FLATSCR-NEXT: v_writelane_b32 v40, s31, 2 ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ; def s40 ; FLATSCR-NEXT: ;;#ASMEND ; FLATSCR-NEXT: s_mov_b32 s34, s40 ; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] +; FLATSCR-NEXT: v_readlane_b32 s30, v40, 1 ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ; use s34 ; FLATSCR-NEXT: ;;#ASMEND -; FLATSCR-NEXT: v_readlane_b32 s34, v40, 2 -; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1 -; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0 +; FLATSCR-NEXT: v_readlane_b32 s31, v40, 2 +; FLATSCR-NEXT: v_readlane_b32 s34, v40, 0 ; FLATSCR-NEXT: s_mov_b32 s32, s33 ; FLATSCR-NEXT: v_readlane_b32 s0, v40, 3 ; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 @@ -555,13 +555,13 @@ define void @callee_saved_sgpr_vgpr_func() #2 { ; MUBUF-NEXT: s_mov_b64 exec, s[6:7] ; MUBUF-NEXT: v_writelane_b32 v41, s4, 3 ; MUBUF-NEXT: s_addk_i32 s32, 0x400 -; MUBUF-NEXT: v_writelane_b32 v41, s30, 0 -; MUBUF-NEXT: v_writelane_b32 v41, s31, 1 +; MUBUF-NEXT: v_writelane_b32 v41, s34, 0 +; MUBUF-NEXT: v_writelane_b32 v41, s30, 1 ; MUBUF-NEXT: s_getpc_b64 s[4:5] ; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 ; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; MUBUF-NEXT: v_writelane_b32 v41, s34, 2 +; MUBUF-NEXT: v_writelane_b32 v41, s31, 2 ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ; def s40 ; MUBUF-NEXT: ;;#ASMEND @@ -577,9 +577,9 @@ define void @callee_saved_sgpr_vgpr_func() #2 { ; MUBUF-NEXT: ; use v40 ; MUBUF-NEXT: ;;#ASMEND ; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; MUBUF-NEXT: v_readlane_b32 s34, v41, 2 -; MUBUF-NEXT: v_readlane_b32 s31, v41, 1 -; MUBUF-NEXT: v_readlane_b32 s30, v41, 0 +; MUBUF-NEXT: v_readlane_b32 s30, v41, 1 +; MUBUF-NEXT: v_readlane_b32 s31, v41, 2 +; MUBUF-NEXT: v_readlane_b32 s34, v41, 0 ; MUBUF-NEXT: s_mov_b32 s32, s33 ; MUBUF-NEXT: v_readlane_b32 s4, v41, 3 ; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -599,13 +599,13 @@ define void @callee_saved_sgpr_vgpr_func() #2 { ; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] ; FLATSCR-NEXT: v_writelane_b32 v41, s0, 3 ; FLATSCR-NEXT: s_add_i32 s32, s32, 16 -; FLATSCR-NEXT: v_writelane_b32 v41, s30, 0 -; FLATSCR-NEXT: v_writelane_b32 v41, s31, 1 +; FLATSCR-NEXT: v_writelane_b32 v41, s34, 0 +; FLATSCR-NEXT: v_writelane_b32 v41, s30, 1 ; FLATSCR-NEXT: s_getpc_b64 s[0:1] ; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 ; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 ; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; FLATSCR-NEXT: v_writelane_b32 v41, s34, 2 +; FLATSCR-NEXT: v_writelane_b32 v41, s31, 2 ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ; def s40 ; FLATSCR-NEXT: ;;#ASMEND @@ -621,9 +621,9 @@ define void @callee_saved_sgpr_vgpr_func() #2 { ; FLATSCR-NEXT: ; use v40 ; FLATSCR-NEXT: ;;#ASMEND ; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload -; FLATSCR-NEXT: v_readlane_b32 s34, v41, 2 -; FLATSCR-NEXT: v_readlane_b32 s31, v41, 1 -; FLATSCR-NEXT: v_readlane_b32 s30, v41, 0 +; FLATSCR-NEXT: v_readlane_b32 s30, v41, 1 +; FLATSCR-NEXT: v_readlane_b32 s31, v41, 2 +; FLATSCR-NEXT: v_readlane_b32 s34, v41, 0 ; FLATSCR-NEXT: s_mov_b32 s32, s33 ; FLATSCR-NEXT: v_readlane_b32 s0, v41, 3 ; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll index e7254eb5c3465..07f58df81c502 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll @@ -132,8 +132,8 @@ define void @callee_with_stack_and_call() #0 { ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_swappc_b64 s[30:31], s[16:17] -; MUBUF-NEXT: v_readlane_b32 s31, v40, 1 ; MUBUF-NEXT: v_readlane_b32 s30, v40, 0 +; MUBUF-NEXT: v_readlane_b32 s31, v40, 1 ; MUBUF-NEXT: s_mov_b32 s32, s33 ; MUBUF-NEXT: v_readlane_b32 s4, v40, 2 ; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -162,8 +162,8 @@ define void @callee_with_stack_and_call() #0 { ; FLATSCR-NEXT: scratch_store_dword off, v0, s33 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] -; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1 ; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0 +; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1 ; FLATSCR-NEXT: s_mov_b32 s32, s33 ; FLATSCR-NEXT: v_readlane_b32 s0, v40, 2 ; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 @@ -201,8 +201,8 @@ define void @callee_no_stack_with_call() #0 { ; MUBUF-NEXT: s_addc_u32 s17, s17, external_void_func_void@rel32@hi+12 ; MUBUF-NEXT: v_writelane_b32 v40, s31, 1 ; MUBUF-NEXT: s_swappc_b64 s[30:31], s[16:17] -; MUBUF-NEXT: v_readlane_b32 s31, v40, 1 ; MUBUF-NEXT: v_readlane_b32 s30, v40, 0 +; MUBUF-NEXT: v_readlane_b32 s31, v40, 1 ; MUBUF-NEXT: s_mov_b32 s32, s33 ; MUBUF-NEXT: v_readlane_b32 s4, v40, 2 ; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -228,8 +228,8 @@ define void @callee_no_stack_with_call() #0 { ; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 ; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1 ; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] -; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1 ; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0 +; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1 ; FLATSCR-NEXT: s_mov_b32 s32, s33 ; FLATSCR-NEXT: v_readlane_b32 s0, v40, 2 ; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 @@ -359,24 +359,24 @@ define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 { ; FLATSCR-NEXT: s_or_saveexec_b64 s[0:1], -1 ; FLATSCR-NEXT: scratch_store_dword off, v40, s32 ; 4-byte Folded Spill ; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] -; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0 -; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1 -; FLATSCR-NEXT: v_writelane_b32 v40, s36, 2 -; FLATSCR-NEXT: v_writelane_b32 v40, s37, 3 -; FLATSCR-NEXT: v_writelane_b32 v40, s38, 4 -; FLATSCR-NEXT: v_writelane_b32 v40, s39, 5 -; FLATSCR-NEXT: v_writelane_b32 v40, s48, 6 -; FLATSCR-NEXT: v_writelane_b32 v40, s49, 7 -; FLATSCR-NEXT: v_writelane_b32 v40, s50, 8 -; FLATSCR-NEXT: v_writelane_b32 v40, s51, 9 -; FLATSCR-NEXT: v_writelane_b32 v40, s52, 10 -; FLATSCR-NEXT: v_writelane_b32 v40, s53, 11 -; FLATSCR-NEXT: v_writelane_b32 v40, s54, 12 -; FLATSCR-NEXT: v_writelane_b32 v40, s55, 13 -; FLATSCR-NEXT: v_writelane_b32 v40, s64, 14 -; FLATSCR-NEXT: v_writelane_b32 v40, s65, 15 -; FLATSCR-NEXT: v_writelane_b32 v40, s66, 16 -; FLATSCR-NEXT: v_writelane_b32 v40, s67, 17 +; FLATSCR-NEXT: v_writelane_b32 v40, s36, 0 +; FLATSCR-NEXT: v_writelane_b32 v40, s37, 1 +; FLATSCR-NEXT: v_writelane_b32 v40, s38, 2 +; FLATSCR-NEXT: v_writelane_b32 v40, s39, 3 +; FLATSCR-NEXT: v_writelane_b32 v40, s48, 4 +; FLATSCR-NEXT: v_writelane_b32 v40, s49, 5 +; FLATSCR-NEXT: v_writelane_b32 v40, s50, 6 +; FLATSCR-NEXT: v_writelane_b32 v40, s51, 7 +; FLATSCR-NEXT: v_writelane_b32 v40, s52, 8 +; FLATSCR-NEXT: v_writelane_b32 v40, s53, 9 +; FLATSCR-NEXT: v_writelane_b32 v40, s54, 10 +; FLATSCR-NEXT: v_writelane_b32 v40, s55, 11 +; FLATSCR-NEXT: v_writelane_b32 v40, s64, 12 +; FLATSCR-NEXT: v_writelane_b32 v40, s65, 13 +; FLATSCR-NEXT: v_writelane_b32 v40, s66, 14 +; FLATSCR-NEXT: v_writelane_b32 v40, s67, 15 +; FLATSCR-NEXT: v_writelane_b32 v40, s30, 16 +; FLATSCR-NEXT: v_writelane_b32 v40, s31, 17 ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ;;#ASMEND ; FLATSCR-NEXT: ;;#ASMSTART @@ -414,6 +414,7 @@ define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 { ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ; use s[16:31] ; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: v_readlane_b32 s30, v40, 16 ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ; use s[72:79] ; FLATSCR-NEXT: ;;#ASMEND @@ -423,24 +424,23 @@ define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 { ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ; use s[0:15] ; FLATSCR-NEXT: ;;#ASMEND -; FLATSCR-NEXT: v_readlane_b32 s67, v40, 17 -; FLATSCR-NEXT: v_readlane_b32 s66, v40, 16 -; FLATSCR-NEXT: v_readlane_b32 s65, v40, 15 -; FLATSCR-NEXT: v_readlane_b32 s64, v40, 14 -; FLATSCR-NEXT: v_readlane_b32 s55, v40, 13 -; FLATSCR-NEXT: v_readlane_b32 s54, v40, 12 -; FLATSCR-NEXT: v_readlane_b32 s53, v40, 11 -; FLATSCR-NEXT: v_readlane_b32 s52, v40, 10 -; FLATSCR-NEXT: v_readlane_b32 s51, v40, 9 -; FLATSCR-NEXT: v_readlane_b32 s50, v40, 8 -; FLATSCR-NEXT: v_readlane_b32 s49, v40, 7 -; FLATSCR-NEXT: v_readlane_b32 s48, v40, 6 -; FLATSCR-NEXT: v_readlane_b32 s39, v40, 5 -; FLATSCR-NEXT: v_readlane_b32 s38, v40, 4 -; FLATSCR-NEXT: v_readlane_b32 s37, v40, 3 -; FLATSCR-NEXT: v_readlane_b32 s36, v40, 2 -; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1 -; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0 +; FLATSCR-NEXT: v_readlane_b32 s31, v40, 17 +; FLATSCR-NEXT: v_readlane_b32 s67, v40, 15 +; FLATSCR-NEXT: v_readlane_b32 s66, v40, 14 +; FLATSCR-NEXT: v_readlane_b32 s65, v40, 13 +; FLATSCR-NEXT: v_readlane_b32 s64, v40, 12 +; FLATSCR-NEXT: v_readlane_b32 s55, v40, 11 +; FLATSCR-NEXT: v_readlane_b32 s54, v40, 10 +; FLATSCR-NEXT: v_readlane_b32 s53, v40, 9 +; FLATSCR-NEXT: v_readlane_b32 s52, v40, 8 +; FLATSCR-NEXT: v_readlane_b32 s51, v40, 7 +; FLATSCR-NEXT: v_readlane_b32 s50, v40, 6 +; FLATSCR-NEXT: v_readlane_b32 s49, v40, 5 +; FLATSCR-NEXT: v_readlane_b32 s48, v40, 4 +; FLATSCR-NEXT: v_readlane_b32 s39, v40, 3 +; FLATSCR-NEXT: v_readlane_b32 s38, v40, 2 +; FLATSCR-NEXT: v_readlane_b32 s37, v40, 1 +; FLATSCR-NEXT: v_readlane_b32 s36, v40, 0 ; FLATSCR-NEXT: s_or_saveexec_b64 s[0:1], -1 ; FLATSCR-NEXT: scratch_load_dword v40, off, s32 ; 4-byte Folded Reload ; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] @@ -971,14 +971,14 @@ define void @no_unused_non_csr_sgpr_for_fp() #1 { ; MUBUF-NEXT: s_mov_b64 exec, s[4:5] ; MUBUF-NEXT: v_writelane_b32 v1, s30, 0 ; MUBUF-NEXT: v_mov_b32_e32 v0, 0 -; MUBUF-NEXT: s_addk_i32 s32, 0x300 ; MUBUF-NEXT: v_writelane_b32 v1, s31, 1 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ;;#ASMEND -; MUBUF-NEXT: v_readlane_b32 s31, v1, 1 +; MUBUF-NEXT: s_addk_i32 s32, 0x300 ; MUBUF-NEXT: v_readlane_b32 s30, v1, 0 +; MUBUF-NEXT: v_readlane_b32 s31, v1, 1 ; MUBUF-NEXT: s_mov_b32 s32, s33 ; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload @@ -997,14 +997,14 @@ define void @no_unused_non_csr_sgpr_for_fp() #1 { ; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] ; FLATSCR-NEXT: v_writelane_b32 v1, s30, 0 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 -; FLATSCR-NEXT: s_add_i32 s32, s32, 12 ; FLATSCR-NEXT: v_writelane_b32 v1, s31, 1 ; FLATSCR-NEXT: scratch_store_dword off, v0, s33 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ;;#ASMEND -; FLATSCR-NEXT: v_readlane_b32 s31, v1, 1 +; FLATSCR-NEXT: s_add_i32 s32, s32, 12 ; FLATSCR-NEXT: v_readlane_b32 s30, v1, 0 +; FLATSCR-NEXT: v_readlane_b32 s31, v1, 1 ; FLATSCR-NEXT: s_mov_b32 s32, s33 ; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; FLATSCR-NEXT: scratch_load_dword v1, off, s33 offset:4 ; 4-byte Folded Reload @@ -1037,17 +1037,17 @@ define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 { ; MUBUF-NEXT: s_mov_b64 exec, s[4:5] ; MUBUF-NEXT: v_writelane_b32 v40, s30, 0 ; MUBUF-NEXT: v_mov_b32_e32 v0, 0 -; MUBUF-NEXT: s_addk_i32 s32, 0x300 ; MUBUF-NEXT: v_writelane_b32 v40, s31, 1 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: s_addk_i32 s32, 0x300 +; MUBUF-NEXT: v_readlane_b32 s30, v40, 0 ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ; clobber nonpreserved initial VGPRs ; MUBUF-NEXT: ;;#ASMEND ; MUBUF-NEXT: v_readlane_b32 s31, v40, 1 -; MUBUF-NEXT: v_readlane_b32 s30, v40, 0 ; MUBUF-NEXT: s_mov_b32 s32, s33 ; MUBUF-NEXT: s_or_saveexec_b64 s[4:5], -1 ; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload @@ -1066,17 +1066,17 @@ define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 { ; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] ; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 -; FLATSCR-NEXT: s_add_i32 s32, s32, 12 ; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1 ; FLATSCR-NEXT: scratch_store_dword off, v0, s33 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_add_i32 s32, s32, 12 +; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0 ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ; clobber nonpreserved initial VGPRs ; FLATSCR-NEXT: ;;#ASMEND ; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1 -; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0 ; FLATSCR-NEXT: s_mov_b32 s32, s33 ; FLATSCR-NEXT: s_or_saveexec_b64 s[0:1], -1 ; FLATSCR-NEXT: scratch_load_dword v40, off, s33 offset:4 ; 4-byte Folded Reload @@ -1118,18 +1118,18 @@ define void @scratch_reg_needed_mubuf_offset(ptr addrspace(5) byval([4096 x i8]) ; MUBUF-NEXT: v_writelane_b32 v40, s30, 0 ; MUBUF-NEXT: v_mov_b32_e32 v0, 0 ; MUBUF-NEXT: v_mov_b32_e32 v1, 0x1000 -; MUBUF-NEXT: s_add_i32 s32, s32, 0x40300 ; MUBUF-NEXT: v_writelane_b32 v40, s31, 1 ; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], s33 offen ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ; clobber nonpreserved SGPRs ; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: s_add_i32 s32, s32, 0x40300 +; MUBUF-NEXT: v_readlane_b32 s30, v40, 0 ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ; clobber nonpreserved VGPRs ; MUBUF-NEXT: ;;#ASMEND ; MUBUF-NEXT: v_readlane_b32 s31, v40, 1 -; MUBUF-NEXT: v_readlane_b32 s30, v40, 0 ; MUBUF-NEXT: s_mov_b32 s32, s33 ; MUBUF-NEXT: s_or_saveexec_b64 s[4:5], -1 ; MUBUF-NEXT: s_add_i32 s6, s33, 0x40100 @@ -1158,11 +1158,11 @@ define void @scratch_reg_needed_mubuf_offset(ptr addrspace(5) byval([4096 x i8]) ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ; clobber nonpreserved SGPRs ; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0 ; FLATSCR-NEXT: ;;#ASMSTART ; FLATSCR-NEXT: ; clobber nonpreserved VGPRs ; FLATSCR-NEXT: ;;#ASMEND ; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1 -; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0 ; FLATSCR-NEXT: s_mov_b32 s32, s33 ; FLATSCR-NEXT: s_or_saveexec_b64 s[0:1], -1 ; FLATSCR-NEXT: s_add_i32 s2, s33, 0x1004 @@ -1220,8 +1220,8 @@ define void @ipra_call_with_stack() #0 { ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_swappc_b64 s[30:31], s[16:17] -; MUBUF-NEXT: v_readlane_b32 s31, v1, 1 ; MUBUF-NEXT: v_readlane_b32 s30, v1, 0 +; MUBUF-NEXT: v_readlane_b32 s31, v1, 1 ; MUBUF-NEXT: s_mov_b32 s32, s33 ; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload @@ -1248,8 +1248,8 @@ define void @ipra_call_with_stack() #0 { ; FLATSCR-NEXT: scratch_store_dword off, v0, s33 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] -; FLATSCR-NEXT: v_readlane_b32 s31, v1, 1 ; FLATSCR-NEXT: v_readlane_b32 s30, v1, 0 +; FLATSCR-NEXT: v_readlane_b32 s31, v1, 1 ; FLATSCR-NEXT: s_mov_b32 s32, s33 ; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; FLATSCR-NEXT: scratch_load_dword v1, off, s33 offset:4 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll index 5f965ba431ab5..bb5963244da3c 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll @@ -430,8 +430,8 @@ define void @func_indirect_use_workitem_id_x() #1 { ; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s4, v40, 2 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -463,8 +463,8 @@ define void @func_indirect_use_workitem_id_y() #1 { ; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s4, v40, 2 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -496,8 +496,8 @@ define void @func_indirect_use_workitem_id_z() #1 { ; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s4, v40, 2 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -984,8 +984,8 @@ define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 { ; GFX7-NEXT: v_writelane_b32 v40, s31, 1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX7-NEXT: v_readlane_b32 s31, v40, 1 ; GFX7-NEXT: v_readlane_b32 s30, v40, 0 +; GFX7-NEXT: v_readlane_b32 s31, v40, 1 ; GFX7-NEXT: s_mov_b32 s32, s33 ; GFX7-NEXT: v_readlane_b32 s4, v40, 2 ; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -1048,8 +1048,8 @@ define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 { ; GFX90A-NEXT: v_writelane_b32 v40, s31, 1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX90A-NEXT: v_readlane_b32 s31, v40, 1 ; GFX90A-NEXT: v_readlane_b32 s30, v40, 0 +; GFX90A-NEXT: v_readlane_b32 s31, v40, 1 ; GFX90A-NEXT: s_mov_b32 s32, s33 ; GFX90A-NEXT: v_readlane_b32 s4, v40, 2 ; GFX90A-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -1094,8 +1094,8 @@ define void @too_many_args_call_too_many_args_use_workitem_id_x( ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s4, v40, 2 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -1445,8 +1445,8 @@ define void @func_call_too_many_args_use_workitem_id_x_byval() #1 { ; GCN-NEXT: v_mov_b32_e32 v0, 10 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s4, v40, 2 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll index bb2f06bfe83f8..f20be656f3af0 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll @@ -275,8 +275,8 @@ define void @func_indirect_use_workitem_id_x() #1 { ; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s4, v40, 2 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -308,8 +308,8 @@ define void @func_indirect_use_workitem_id_y() #1 { ; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s4, v40, 2 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -341,8 +341,8 @@ define void @func_indirect_use_workitem_id_z() #1 { ; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s4, v40, 2 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -696,8 +696,8 @@ define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 { ; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s4, v40, 2 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -742,8 +742,8 @@ define void @too_many_args_call_too_many_args_use_workitem_id_x( ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s4, v40, 2 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -1019,8 +1019,8 @@ define void @func_call_too_many_args_use_workitem_id_x_byval() #1 { ; GCN-NEXT: v_mov_b32_e32 v0, 10 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s4, v40, 2 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -1469,8 +1469,8 @@ define void @func_call_no_workitem_id_hints() #2 { ; GCN-NEXT: v_mov_b32_e32 v0, 9 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s4, v40, 2 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll index 38c20c7cf62d6..0cab17c9bfcfc 100644 --- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll @@ -40,8 +40,8 @@ define float @call_split_type_used_outside_block_v2f32() #0 { ; GCN-NEXT: s_addc_u32 s17, s17, func_v2f32@rel32@hi+12 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s4, v40, 2 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -76,8 +76,8 @@ define float @call_split_type_used_outside_block_v3f32() #0 { ; GCN-NEXT: s_addc_u32 s17, s17, func_v3f32@rel32@hi+12 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s4, v40, 2 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -112,8 +112,8 @@ define half @call_split_type_used_outside_block_v4f16() #0 { ; GCN-NEXT: s_addc_u32 s17, s17, func_v4f16@rel32@hi+12 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s4, v40, 2 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -148,8 +148,8 @@ define { i32, half } @call_split_type_used_outside_block_struct() #0 { ; GCN-NEXT: s_addc_u32 s17, s17, func_struct@rel32@hi+12 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_mov_b32_e32 v1, v4 ; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s4, v40, 2 diff --git a/llvm/test/CodeGen/AMDGPU/debug-frame.ll b/llvm/test/CodeGen/AMDGPU/debug-frame.ll index 676144e65c10f..7d38cea998256 100644 --- a/llvm/test/CodeGen/AMDGPU/debug-frame.ll +++ b/llvm/test/CodeGen/AMDGPU/debug-frame.ll @@ -2005,8 +2005,8 @@ define hidden void @func_call_clobber() #0 { ; GFX900-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12 ; GFX900-NEXT: v_writelane_b32 v40, s31, 1 ; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX900-NEXT: v_readlane_b32 s31, v40, 1 ; GFX900-NEXT: v_readlane_b32 s30, v40, 0 +; GFX900-NEXT: v_readlane_b32 s31, v40, 1 ; GFX900-NEXT: s_mov_b32 s32, s33 ; GFX900-NEXT: v_readlane_b32 s4, v40, 2 ; GFX900-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -2278,8 +2278,8 @@ define hidden void @func_call_clobber() #0 { ; GFX90A-V2A-DIS-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12 ; GFX90A-V2A-DIS-NEXT: v_writelane_b32 v40, s31, 1 ; GFX90A-V2A-DIS-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX90A-V2A-DIS-NEXT: v_readlane_b32 s31, v40, 1 ; GFX90A-V2A-DIS-NEXT: v_readlane_b32 s30, v40, 0 +; GFX90A-V2A-DIS-NEXT: v_readlane_b32 s31, v40, 1 ; GFX90A-V2A-DIS-NEXT: s_mov_b32 s32, s33 ; GFX90A-V2A-DIS-NEXT: v_readlane_b32 s4, v40, 2 ; GFX90A-V2A-DIS-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -2551,8 +2551,8 @@ define hidden void @func_call_clobber() #0 { ; GFX90A-V2A-EN-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12 ; GFX90A-V2A-EN-NEXT: v_writelane_b32 v40, s31, 1 ; GFX90A-V2A-EN-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX90A-V2A-EN-NEXT: v_readlane_b32 s31, v40, 1 ; GFX90A-V2A-EN-NEXT: v_readlane_b32 s30, v40, 0 +; GFX90A-V2A-EN-NEXT: v_readlane_b32 s31, v40, 1 ; GFX90A-V2A-EN-NEXT: s_mov_b32 s32, s33 ; GFX90A-V2A-EN-NEXT: v_readlane_b32 s4, v40, 2 ; GFX90A-V2A-EN-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -2793,8 +2793,8 @@ define hidden void @func_call_clobber() #0 { ; WAVE32-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12 ; WAVE32-NEXT: v_writelane_b32 v40, s31, 1 ; WAVE32-NEXT: s_swappc_b64 s[30:31], s[16:17] -; WAVE32-NEXT: v_readlane_b32 s31, v40, 1 ; WAVE32-NEXT: v_readlane_b32 s30, v40, 0 +; WAVE32-NEXT: v_readlane_b32 s31, v40, 1 ; WAVE32-NEXT: s_mov_b32 s32, s33 ; WAVE32-NEXT: v_readlane_b32 s4, v40, 2 ; WAVE32-NEXT: s_or_saveexec_b32 s5, -1 diff --git a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll index a0c25b2a0beb3..705d403764503 100644 --- a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll @@ -489,22 +489,20 @@ define weak_odr void @test(i32 %0) !dbg !34 { ; CHECK-NEXT: v_writelane_b32 v41, s16, 16 ; CHECK-NEXT: .cfi_llvm_vector_registers 65, 2601, 16, 32 ; CHECK-NEXT: .cfi_def_cfa_register 65 -; CHECK-NEXT: v_writelane_b32 v41, s30, 0 -; CHECK-NEXT: v_writelane_b32 v41, s31, 1 -; CHECK-NEXT: v_writelane_b32 v41, s34, 2 -; CHECK-NEXT: v_writelane_b32 v41, s35, 3 -; CHECK-NEXT: v_writelane_b32 v41, s36, 4 -; CHECK-NEXT: v_writelane_b32 v41, s37, 5 -; CHECK-NEXT: v_writelane_b32 v41, s38, 6 -; CHECK-NEXT: v_writelane_b32 v41, s39, 7 -; CHECK-NEXT: v_writelane_b32 v41, s48, 8 -; CHECK-NEXT: v_writelane_b32 v41, s49, 9 -; CHECK-NEXT: v_writelane_b32 v41, s50, 10 -; CHECK-NEXT: v_writelane_b32 v41, s51, 11 -; CHECK-NEXT: v_writelane_b32 v41, s52, 12 +; CHECK-NEXT: v_writelane_b32 v41, s34, 0 +; CHECK-NEXT: v_writelane_b32 v41, s35, 1 +; CHECK-NEXT: v_writelane_b32 v41, s36, 2 +; CHECK-NEXT: v_writelane_b32 v41, s37, 3 +; CHECK-NEXT: v_writelane_b32 v41, s38, 4 +; CHECK-NEXT: v_writelane_b32 v41, s39, 5 +; CHECK-NEXT: v_writelane_b32 v41, s48, 6 +; CHECK-NEXT: v_writelane_b32 v41, s49, 7 +; CHECK-NEXT: v_writelane_b32 v41, s50, 8 +; CHECK-NEXT: v_writelane_b32 v41, s51, 9 +; CHECK-NEXT: v_writelane_b32 v41, s52, 10 ; CHECK-NEXT: s_addk_i32 s32, 0x400 -; CHECK-NEXT: v_writelane_b32 v41, s53, 13 -; CHECK-NEXT: v_writelane_b32 v41, s54, 14 +; CHECK-NEXT: v_writelane_b32 v41, s53, 11 +; CHECK-NEXT: v_writelane_b32 v41, s54, 12 ; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5] ; CHECK-NEXT: ;DEBUG_VALUE: dummy:dummy <- undef ; CHECK-NEXT: .Ltmp0: @@ -512,10 +510,12 @@ define weak_odr void @test(i32 %0) !dbg !34 { ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, __kmpc_alloc_shared@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, __kmpc_alloc_shared@gotpcrel32@hi+12 -; CHECK-NEXT: v_writelane_b32 v41, s55, 15 +; CHECK-NEXT: v_writelane_b32 v41, s55, 13 ; CHECK-NEXT: s_load_dwordx2 s[54:55], s[4:5], 0x0 +; CHECK-NEXT: v_writelane_b32 v41, s30, 14 ; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: v_writelane_b32 v41, s31, 15 ; CHECK-NEXT: v_mov_b32_e32 v40, v31 ; CHECK-NEXT: s_mov_b32 s50, s15 ; CHECK-NEXT: s_mov_b32 s51, s14 @@ -541,23 +541,23 @@ define weak_odr void @test(i32 %0) !dbg !34 { ; CHECK-NEXT: .loc 1 0 9 is_stmt 0 ; dummy:0:9 ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: v_readlane_b32 s30, v41, 14 ; CHECK-NEXT: flat_store_dword v[0:1], v2 -; CHECK-NEXT: v_readlane_b32 s55, v41, 15 -; CHECK-NEXT: v_readlane_b32 s54, v41, 14 -; CHECK-NEXT: v_readlane_b32 s53, v41, 13 -; CHECK-NEXT: v_readlane_b32 s52, v41, 12 -; CHECK-NEXT: v_readlane_b32 s51, v41, 11 -; CHECK-NEXT: v_readlane_b32 s50, v41, 10 -; CHECK-NEXT: v_readlane_b32 s49, v41, 9 -; CHECK-NEXT: v_readlane_b32 s48, v41, 8 -; CHECK-NEXT: v_readlane_b32 s39, v41, 7 -; CHECK-NEXT: v_readlane_b32 s38, v41, 6 -; CHECK-NEXT: v_readlane_b32 s37, v41, 5 -; CHECK-NEXT: v_readlane_b32 s36, v41, 4 -; CHECK-NEXT: v_readlane_b32 s35, v41, 3 -; CHECK-NEXT: v_readlane_b32 s34, v41, 2 -; CHECK-NEXT: v_readlane_b32 s31, v41, 1 -; CHECK-NEXT: v_readlane_b32 s30, v41, 0 +; CHECK-NEXT: v_readlane_b32 s31, v41, 15 +; CHECK-NEXT: v_readlane_b32 s55, v41, 13 +; CHECK-NEXT: v_readlane_b32 s54, v41, 12 +; CHECK-NEXT: v_readlane_b32 s53, v41, 11 +; CHECK-NEXT: v_readlane_b32 s52, v41, 10 +; CHECK-NEXT: v_readlane_b32 s51, v41, 9 +; CHECK-NEXT: v_readlane_b32 s50, v41, 8 +; CHECK-NEXT: v_readlane_b32 s49, v41, 7 +; CHECK-NEXT: v_readlane_b32 s48, v41, 6 +; CHECK-NEXT: v_readlane_b32 s39, v41, 5 +; CHECK-NEXT: v_readlane_b32 s38, v41, 4 +; CHECK-NEXT: v_readlane_b32 s37, v41, 3 +; CHECK-NEXT: v_readlane_b32 s36, v41, 2 +; CHECK-NEXT: v_readlane_b32 s35, v41, 1 +; CHECK-NEXT: v_readlane_b32 s34, v41, 0 ; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: v_readlane_b32 s4, v41, 16 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 diff --git a/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll b/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll index bcccf50e3805c..db48100aa7caf 100644 --- a/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll +++ b/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll @@ -299,8 +299,8 @@ define amdgpu_gfx void @amdgpu_gfx() #0 { ; CHECK-TRUE16-NEXT: s_wait_alu 0xfffe ; CHECK-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1] ; CHECK-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; CHECK-TRUE16-NEXT: v_readlane_b32 s31, v40, 1 ; CHECK-TRUE16-NEXT: v_readlane_b32 s30, v40, 0 +; CHECK-TRUE16-NEXT: v_readlane_b32 s31, v40, 1 ; CHECK-TRUE16-NEXT: s_mov_b32 s32, s33 ; CHECK-TRUE16-NEXT: v_readlane_b32 s0, v40, 2 ; CHECK-TRUE16-NEXT: s_or_saveexec_b32 s1, -1 @@ -339,8 +339,8 @@ define amdgpu_gfx void @amdgpu_gfx() #0 { ; CHECK-FAKE16-NEXT: s_wait_alu 0xfffe ; CHECK-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1] ; CHECK-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; CHECK-FAKE16-NEXT: v_readlane_b32 s31, v40, 1 ; CHECK-FAKE16-NEXT: v_readlane_b32 s30, v40, 0 +; CHECK-FAKE16-NEXT: v_readlane_b32 s31, v40, 1 ; CHECK-FAKE16-NEXT: s_mov_b32 s32, s33 ; CHECK-FAKE16-NEXT: v_readlane_b32 s0, v40, 2 ; CHECK-FAKE16-NEXT: s_or_saveexec_b32 s1, -1 diff --git a/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll b/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll index 76a2114a000cf..cba5aa8ef3672 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll @@ -55,8 +55,8 @@ define void @test_stack_realign(<8 x i32> %val, i32 %idx) #0 { ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: v_readlane_b32 s31, v42, 1 ; GCN-NEXT: v_readlane_b32 s30, v42, 0 +; GCN-NEXT: v_readlane_b32 s31, v42, 1 ; GCN-NEXT: s_mov_b32 s32, s34 ; GCN-NEXT: v_readlane_b32 s4, v42, 2 ; GCN-NEXT: v_readlane_b32 s34, v42, 3 diff --git a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll index 2e88da142bb41..6abe5998d6767 100644 --- a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll @@ -26,8 +26,8 @@ define void @callee_with_stack_and_call() #0 { ; SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) ; SPILL-TO-VGPR-NEXT: s_swappc_b64 s[30:31], s[4:5] -; SPILL-TO-VGPR-NEXT: v_readlane_b32 s31, v40, 1 ; SPILL-TO-VGPR-NEXT: v_readlane_b32 s30, v40, 0 +; SPILL-TO-VGPR-NEXT: v_readlane_b32 s31, v40, 1 ; SPILL-TO-VGPR-NEXT: s_mov_b32 s32, s33 ; SPILL-TO-VGPR-NEXT: v_readlane_b32 s4, v40, 2 ; SPILL-TO-VGPR-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -46,21 +46,14 @@ define void @callee_with_stack_and_call() #0 { ; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; NO-SPILL-TO-VGPR-NEXT: s_addk_i32 s32, 0x800 ; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 s[4:5], exec -; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, 1 +; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, 3 ; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:16 ; NO-SPILL-TO-VGPR-NEXT: v_writelane_b32 v0, s30, 0 +; NO-SPILL-TO-VGPR-NEXT: v_writelane_b32 v0, s31, 1 ; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:16 ; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) ; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[4:5] -; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 s[4:5], exec -; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, 1 -; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:16 -; NO-SPILL-TO-VGPR-NEXT: v_writelane_b32 v0, s31, 0 -; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:16 -; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) -; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[4:5] ; NO-SPILL-TO-VGPR-NEXT: v_mov_b32_e32 v0, 0 ; NO-SPILL-TO-VGPR-NEXT: s_getpc_b64 s[4:5] ; NO-SPILL-TO-VGPR-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 @@ -69,20 +62,12 @@ define void @callee_with_stack_and_call() #0 { ; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) ; NO-SPILL-TO-VGPR-NEXT: s_swappc_b64 s[30:31], s[4:5] ; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 s[4:5], exec -; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, 1 -; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:16 -; NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) -; NO-SPILL-TO-VGPR-NEXT: v_readlane_b32 s31, v0, 0 -; NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:16 -; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) -; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[4:5] -; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 s[4:5], exec -; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, 1 +; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, 3 ; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:16 ; NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) ; NO-SPILL-TO-VGPR-NEXT: v_readlane_b32 s30, v0, 0 +; NO-SPILL-TO-VGPR-NEXT: v_readlane_b32 s31, v0, 1 ; NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:16 ; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) ; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll index 0f535c3a3bdbc..6dd683e6fca53 100644 --- a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll @@ -1740,8 +1740,8 @@ define void @caller_void_func_i32_v2float_inreg(i32 inreg %arg0, <2 x float> inr ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -1770,8 +1770,8 @@ define void @caller_void_func_i32_v2float_inreg(i32 inreg %arg0, <2 x float> inr ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -2159,8 +2159,8 @@ define void @void_func_a13i32_inreg([13 x i32] inreg %arg0, ptr addrspace(1) %p ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -2201,8 +2201,8 @@ define void @void_func_a13i32_inreg([13 x i32] inreg %arg0, ptr addrspace(1) %p ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 diff --git a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll index 9d137fb4101e4..a2f203a111e18 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll @@ -39,47 +39,47 @@ define amdgpu_gfx void @gfx_func() { ; SDAG-NEXT: v_writelane_b32 v40, s27, 23 ; SDAG-NEXT: v_writelane_b32 v40, s28, 24 ; SDAG-NEXT: v_writelane_b32 v40, s29, 25 -; SDAG-NEXT: v_writelane_b32 v40, s30, 26 -; SDAG-NEXT: v_writelane_b32 v40, s31, 27 -; SDAG-NEXT: v_writelane_b32 v40, s72, 28 -; SDAG-NEXT: v_writelane_b32 v40, s73, 29 -; SDAG-NEXT: v_writelane_b32 v40, s74, 30 -; SDAG-NEXT: v_writelane_b32 v40, s75, 31 -; SDAG-NEXT: v_writelane_b32 v40, s76, 32 -; SDAG-NEXT: v_writelane_b32 v40, s77, 33 -; SDAG-NEXT: v_writelane_b32 v40, s78, 34 -; SDAG-NEXT: v_writelane_b32 v40, s79, 35 -; SDAG-NEXT: v_writelane_b32 v40, s88, 36 -; SDAG-NEXT: v_writelane_b32 v40, s89, 37 -; SDAG-NEXT: v_writelane_b32 v40, s90, 38 -; SDAG-NEXT: v_writelane_b32 v40, s91, 39 -; SDAG-NEXT: v_writelane_b32 v40, s92, 40 -; SDAG-NEXT: v_writelane_b32 v40, s93, 41 -; SDAG-NEXT: v_writelane_b32 v40, s94, 42 +; SDAG-NEXT: v_writelane_b32 v40, s72, 26 +; SDAG-NEXT: v_writelane_b32 v40, s73, 27 +; SDAG-NEXT: v_writelane_b32 v40, s74, 28 +; SDAG-NEXT: v_writelane_b32 v40, s75, 29 +; SDAG-NEXT: v_writelane_b32 v40, s76, 30 +; SDAG-NEXT: v_writelane_b32 v40, s77, 31 +; SDAG-NEXT: v_writelane_b32 v40, s78, 32 +; SDAG-NEXT: v_writelane_b32 v40, s79, 33 +; SDAG-NEXT: v_writelane_b32 v40, s88, 34 +; SDAG-NEXT: v_writelane_b32 v40, s89, 35 +; SDAG-NEXT: v_writelane_b32 v40, s90, 36 +; SDAG-NEXT: v_writelane_b32 v40, s91, 37 +; SDAG-NEXT: v_writelane_b32 v40, s92, 38 +; SDAG-NEXT: v_writelane_b32 v40, s93, 39 +; SDAG-NEXT: v_writelane_b32 v40, s94, 40 +; SDAG-NEXT: v_writelane_b32 v40, s95, 41 +; SDAG-NEXT: v_writelane_b32 v40, s30, 42 ; SDAG-NEXT: s_mov_b32 s35, extern_c_func@abs32@hi ; SDAG-NEXT: s_mov_b32 s34, extern_c_func@abs32@lo ; SDAG-NEXT: s_mov_b64 s[8:9], 0 ; SDAG-NEXT: s_addk_i32 s32, 0x400 -; SDAG-NEXT: v_writelane_b32 v40, s95, 43 +; SDAG-NEXT: v_writelane_b32 v40, s31, 43 ; SDAG-NEXT: s_swappc_b64 s[30:31], s[34:35] -; SDAG-NEXT: v_readlane_b32 s95, v40, 43 -; SDAG-NEXT: v_readlane_b32 s94, v40, 42 -; SDAG-NEXT: v_readlane_b32 s93, v40, 41 -; SDAG-NEXT: v_readlane_b32 s92, v40, 40 -; SDAG-NEXT: v_readlane_b32 s91, v40, 39 -; SDAG-NEXT: v_readlane_b32 s90, v40, 38 -; SDAG-NEXT: v_readlane_b32 s89, v40, 37 -; SDAG-NEXT: v_readlane_b32 s88, v40, 36 -; SDAG-NEXT: v_readlane_b32 s79, v40, 35 -; SDAG-NEXT: v_readlane_b32 s78, v40, 34 -; SDAG-NEXT: v_readlane_b32 s77, v40, 33 -; SDAG-NEXT: v_readlane_b32 s76, v40, 32 -; SDAG-NEXT: v_readlane_b32 s75, v40, 31 -; SDAG-NEXT: v_readlane_b32 s74, v40, 30 -; SDAG-NEXT: v_readlane_b32 s73, v40, 29 -; SDAG-NEXT: v_readlane_b32 s72, v40, 28 -; SDAG-NEXT: v_readlane_b32 s31, v40, 27 -; SDAG-NEXT: v_readlane_b32 s30, v40, 26 +; SDAG-NEXT: v_readlane_b32 s30, v40, 42 +; SDAG-NEXT: v_readlane_b32 s31, v40, 43 +; SDAG-NEXT: v_readlane_b32 s95, v40, 41 +; SDAG-NEXT: v_readlane_b32 s94, v40, 40 +; SDAG-NEXT: v_readlane_b32 s93, v40, 39 +; SDAG-NEXT: v_readlane_b32 s92, v40, 38 +; SDAG-NEXT: v_readlane_b32 s91, v40, 37 +; SDAG-NEXT: v_readlane_b32 s90, v40, 36 +; SDAG-NEXT: v_readlane_b32 s89, v40, 35 +; SDAG-NEXT: v_readlane_b32 s88, v40, 34 +; SDAG-NEXT: v_readlane_b32 s79, v40, 33 +; SDAG-NEXT: v_readlane_b32 s78, v40, 32 +; SDAG-NEXT: v_readlane_b32 s77, v40, 31 +; SDAG-NEXT: v_readlane_b32 s76, v40, 30 +; SDAG-NEXT: v_readlane_b32 s75, v40, 29 +; SDAG-NEXT: v_readlane_b32 s74, v40, 28 +; SDAG-NEXT: v_readlane_b32 s73, v40, 27 +; SDAG-NEXT: v_readlane_b32 s72, v40, 26 ; SDAG-NEXT: v_readlane_b32 s29, v40, 25 ; SDAG-NEXT: v_readlane_b32 s28, v40, 24 ; SDAG-NEXT: v_readlane_b32 s27, v40, 23 @@ -148,47 +148,47 @@ define amdgpu_gfx void @gfx_func() { ; GISEL-NEXT: v_writelane_b32 v40, s27, 23 ; GISEL-NEXT: v_writelane_b32 v40, s28, 24 ; GISEL-NEXT: v_writelane_b32 v40, s29, 25 -; GISEL-NEXT: v_writelane_b32 v40, s30, 26 -; GISEL-NEXT: v_writelane_b32 v40, s31, 27 -; GISEL-NEXT: v_writelane_b32 v40, s72, 28 -; GISEL-NEXT: v_writelane_b32 v40, s73, 29 -; GISEL-NEXT: v_writelane_b32 v40, s74, 30 -; GISEL-NEXT: v_writelane_b32 v40, s75, 31 -; GISEL-NEXT: v_writelane_b32 v40, s76, 32 -; GISEL-NEXT: v_writelane_b32 v40, s77, 33 -; GISEL-NEXT: v_writelane_b32 v40, s78, 34 -; GISEL-NEXT: v_writelane_b32 v40, s79, 35 -; GISEL-NEXT: v_writelane_b32 v40, s88, 36 -; GISEL-NEXT: v_writelane_b32 v40, s89, 37 -; GISEL-NEXT: v_writelane_b32 v40, s90, 38 -; GISEL-NEXT: v_writelane_b32 v40, s91, 39 -; GISEL-NEXT: v_writelane_b32 v40, s92, 40 -; GISEL-NEXT: v_writelane_b32 v40, s93, 41 -; GISEL-NEXT: v_writelane_b32 v40, s94, 42 +; GISEL-NEXT: v_writelane_b32 v40, s72, 26 +; GISEL-NEXT: v_writelane_b32 v40, s73, 27 +; GISEL-NEXT: v_writelane_b32 v40, s74, 28 +; GISEL-NEXT: v_writelane_b32 v40, s75, 29 +; GISEL-NEXT: v_writelane_b32 v40, s76, 30 +; GISEL-NEXT: v_writelane_b32 v40, s77, 31 +; GISEL-NEXT: v_writelane_b32 v40, s78, 32 +; GISEL-NEXT: v_writelane_b32 v40, s79, 33 +; GISEL-NEXT: v_writelane_b32 v40, s88, 34 +; GISEL-NEXT: v_writelane_b32 v40, s89, 35 +; GISEL-NEXT: v_writelane_b32 v40, s90, 36 +; GISEL-NEXT: v_writelane_b32 v40, s91, 37 +; GISEL-NEXT: v_writelane_b32 v40, s92, 38 +; GISEL-NEXT: v_writelane_b32 v40, s93, 39 +; GISEL-NEXT: v_writelane_b32 v40, s94, 40 +; GISEL-NEXT: v_writelane_b32 v40, s95, 41 +; GISEL-NEXT: v_writelane_b32 v40, s30, 42 ; GISEL-NEXT: s_mov_b32 s34, extern_c_func@abs32@lo ; GISEL-NEXT: s_mov_b32 s35, extern_c_func@abs32@hi ; GISEL-NEXT: s_mov_b64 s[8:9], 0 ; GISEL-NEXT: s_addk_i32 s32, 0x400 -; GISEL-NEXT: v_writelane_b32 v40, s95, 43 +; GISEL-NEXT: v_writelane_b32 v40, s31, 43 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GISEL-NEXT: v_readlane_b32 s95, v40, 43 -; GISEL-NEXT: v_readlane_b32 s94, v40, 42 -; GISEL-NEXT: v_readlane_b32 s93, v40, 41 -; GISEL-NEXT: v_readlane_b32 s92, v40, 40 -; GISEL-NEXT: v_readlane_b32 s91, v40, 39 -; GISEL-NEXT: v_readlane_b32 s90, v40, 38 -; GISEL-NEXT: v_readlane_b32 s89, v40, 37 -; GISEL-NEXT: v_readlane_b32 s88, v40, 36 -; GISEL-NEXT: v_readlane_b32 s79, v40, 35 -; GISEL-NEXT: v_readlane_b32 s78, v40, 34 -; GISEL-NEXT: v_readlane_b32 s77, v40, 33 -; GISEL-NEXT: v_readlane_b32 s76, v40, 32 -; GISEL-NEXT: v_readlane_b32 s75, v40, 31 -; GISEL-NEXT: v_readlane_b32 s74, v40, 30 -; GISEL-NEXT: v_readlane_b32 s73, v40, 29 -; GISEL-NEXT: v_readlane_b32 s72, v40, 28 -; GISEL-NEXT: v_readlane_b32 s31, v40, 27 -; GISEL-NEXT: v_readlane_b32 s30, v40, 26 +; GISEL-NEXT: v_readlane_b32 s30, v40, 42 +; GISEL-NEXT: v_readlane_b32 s31, v40, 43 +; GISEL-NEXT: v_readlane_b32 s95, v40, 41 +; GISEL-NEXT: v_readlane_b32 s94, v40, 40 +; GISEL-NEXT: v_readlane_b32 s93, v40, 39 +; GISEL-NEXT: v_readlane_b32 s92, v40, 38 +; GISEL-NEXT: v_readlane_b32 s91, v40, 37 +; GISEL-NEXT: v_readlane_b32 s90, v40, 36 +; GISEL-NEXT: v_readlane_b32 s89, v40, 35 +; GISEL-NEXT: v_readlane_b32 s88, v40, 34 +; GISEL-NEXT: v_readlane_b32 s79, v40, 33 +; GISEL-NEXT: v_readlane_b32 s78, v40, 32 +; GISEL-NEXT: v_readlane_b32 s77, v40, 31 +; GISEL-NEXT: v_readlane_b32 s76, v40, 30 +; GISEL-NEXT: v_readlane_b32 s75, v40, 29 +; GISEL-NEXT: v_readlane_b32 s74, v40, 28 +; GISEL-NEXT: v_readlane_b32 s73, v40, 27 +; GISEL-NEXT: v_readlane_b32 s72, v40, 26 ; GISEL-NEXT: v_readlane_b32 s29, v40, 25 ; GISEL-NEXT: v_readlane_b32 s28, v40, 24 ; GISEL-NEXT: v_readlane_b32 s27, v40, 23 diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll index 234eaa8af7edf..39231558934a7 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -141,8 +141,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 { ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s32 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -170,8 +170,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 { ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -200,8 +200,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -229,8 +229,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -264,8 +264,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 { ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s32 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -295,8 +295,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 { ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -326,8 +326,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 { ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: scratch_store_b8 off, v0, s32 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -357,8 +357,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 { ; GFX10-SCRATCH-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -393,8 +393,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 { ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s32 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -424,8 +424,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 { ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -455,8 +455,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 { ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: scratch_store_b8 off, v0, s32 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -486,8 +486,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 { ; GFX10-SCRATCH-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -519,8 +519,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -547,8 +547,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -576,8 +576,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 { ; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33 ; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1 @@ -604,8 +604,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 { ; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33 ; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1 @@ -632,8 +632,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -665,8 +665,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -694,8 +694,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -724,8 +724,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 { ; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33 ; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1 @@ -753,8 +753,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 { ; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33 ; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1 @@ -782,8 +782,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -816,8 +816,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -845,8 +845,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -875,8 +875,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 { ; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33 ; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1 @@ -904,8 +904,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 { ; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33 ; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1 @@ -933,8 +933,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -966,8 +966,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -994,8 +994,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -1023,8 +1023,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 { ; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33 ; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1 @@ -1051,8 +1051,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 { ; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33 ; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1 @@ -1079,8 +1079,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -1112,8 +1112,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -1141,8 +1141,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -1171,8 +1171,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 { ; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33 ; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1 @@ -1200,8 +1200,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 { ; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33 ; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1 @@ -1229,8 +1229,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -1263,8 +1263,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -1292,8 +1292,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -1322,8 +1322,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 { ; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33 ; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1 @@ -1351,8 +1351,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 { ; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33 ; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1 @@ -1380,8 +1380,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -1413,8 +1413,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -1441,8 +1441,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -1470,8 +1470,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -1498,8 +1498,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -1531,8 +1531,8 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -1560,8 +1560,8 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 { ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -1589,8 +1589,8 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -1618,8 +1618,8 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 { ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -1652,8 +1652,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -1682,8 +1682,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -1713,8 +1713,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -1743,8 +1743,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -1779,8 +1779,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -1810,8 +1810,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 { ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -1840,8 +1840,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -1871,8 +1871,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 { ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -1907,8 +1907,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -1939,8 +1939,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 { ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -1970,8 +1970,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -2002,8 +2002,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 { ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -2043,8 +2043,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -2077,8 +2077,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 { ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -2109,8 +2109,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -2143,8 +2143,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 { ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -2177,8 +2177,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -2205,8 +2205,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -2234,8 +2234,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 { ; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33 ; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1 @@ -2262,8 +2262,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 { ; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33 ; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1 @@ -2290,8 +2290,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -2322,8 +2322,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -2350,8 +2350,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -2379,8 +2379,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -2407,8 +2407,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -2440,8 +2440,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -2469,8 +2469,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 { ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -2498,8 +2498,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -2527,8 +2527,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 { ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -2561,8 +2561,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -2591,8 +2591,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 { ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -2621,8 +2621,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -2651,8 +2651,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 { ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -2687,8 +2687,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -2719,8 +2719,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 { ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -2750,8 +2750,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -2782,8 +2782,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 { ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -2815,8 +2815,8 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -2844,8 +2844,8 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 { ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -2873,8 +2873,8 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -2902,8 +2902,8 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 { ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -2937,8 +2937,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -2968,8 +2968,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 { ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -2998,8 +2998,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -3029,8 +3029,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 { ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -3066,8 +3066,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -3099,8 +3099,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 { ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -3130,8 +3130,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -3163,8 +3163,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 { ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -3200,8 +3200,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -3233,8 +3233,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 { ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -3267,8 +3267,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1 ; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33 ; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1 @@ -3300,8 +3300,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 { ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1 ; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33 ; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1 @@ -3333,8 +3333,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 { ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -3371,8 +3371,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8() #0 { ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -3404,8 +3404,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8() #0 { ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -3437,8 +3437,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8() #0 { ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -3470,8 +3470,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8() #0 { ; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -3509,8 +3509,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8() #0 { ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -3543,8 +3543,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8() #0 { ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -3577,8 +3577,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8() #0 { ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -3611,8 +3611,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8() #0 { ; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -3652,8 +3652,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8() #0 { ; GFX9-NEXT: v_mov_b32_e32 v0, v5 ; GFX9-NEXT: v_mov_b32_e32 v4, v6 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -3688,8 +3688,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8() #0 { ; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v4, v6 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -3724,8 +3724,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8() #0 { ; GFX11-NEXT: v_mov_b32_e32 v0, v5 ; GFX11-NEXT: v_mov_b32_e32 v4, v6 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -3760,8 +3760,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8() #0 { ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, v6 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -3804,8 +3804,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8() #0 { ; GFX9-NEXT: v_mov_b32_e32 v4, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, v8 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -3843,8 +3843,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8() #0 { ; GFX10-NEXT: v_mov_b32_e32 v4, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v8 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -3881,8 +3881,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8() #0 { ; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, v8 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -3920,8 +3920,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8() #0 { ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, v1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, v8 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -3996,8 +3996,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8() #0 { ; GFX9-NEXT: v_mov_b32_e32 v18, v33 ; GFX9-NEXT: v_mov_b32_e32 v19, v34 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -4068,8 +4068,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8() #0 { ; GFX10-NEXT: v_mov_b32_e32 v18, v33 ; GFX10-NEXT: v_mov_b32_e32 v19, v34 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -4135,8 +4135,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8() #0 { ; GFX11-NEXT: v_dual_mov_b32 v17, v32 :: v_dual_mov_b32 v18, v33 ; GFX11-NEXT: v_mov_b32_e32 v19, v34 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -4207,8 +4207,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8() #0 { ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v18, v33 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v19, v34 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -4249,8 +4249,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 { ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v42, 1 ; GFX9-NEXT: v_readlane_b32 s30, v42, 0 +; GFX9-NEXT: v_readlane_b32 s31, v42, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v42, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -4285,8 +4285,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 { ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 -; GFX10-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v42, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -4322,8 +4322,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 { ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s33 ; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 offset:4 -; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1 ; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1 ; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33 ; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2 ; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1 @@ -4358,8 +4358,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 { ; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s33 ; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s33 offset:4 -; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v42, 1 ; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v42, 0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v42, 1 ; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33 ; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v42, 2 ; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1 @@ -4394,8 +4394,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 { ; GFX10-SCRATCH-NEXT: s_clause 0x1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:4 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -4442,8 +4442,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 { ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v42, 1 ; GFX9-NEXT: v_readlane_b32 s30, v42, 0 +; GFX9-NEXT: v_readlane_b32 s31, v42, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v42, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -4478,8 +4478,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 { ; GFX10-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1 -; GFX10-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v42, 2 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -4522,8 +4522,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 { ; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1 ; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h ; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33 @@ -4566,8 +4566,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 { ; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v42, 1 ; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v42, 0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v42, 1 ; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33 ; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v42, 2 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 @@ -4607,8 +4607,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 { ; GFX10-SCRATCH-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_lshlrev_b16 v1, 8, v1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2 ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -4663,8 +4663,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 { ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v42, 1 ; GFX9-NEXT: v_readlane_b32 s30, v42, 0 +; GFX9-NEXT: v_readlane_b32 s31, v42, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v42, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -4701,8 +4701,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 { ; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1 ; GFX10-NEXT: v_mov_b32_e32 v3, 2 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v42, 2 @@ -4743,20 +4743,20 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, 2 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0 ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h -; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1 ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_store_b8 v[3:4], v2, off ; GFX11-TRUE16-NEXT: global_store_b16 v[40:41], v0, off ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s33 ; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 offset:4 -; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1 ; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33 ; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2 ; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1 @@ -4795,8 +4795,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v0 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 2 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v42, 1 ; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v42, 0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v42, 1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33 ; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v42, 2 @@ -4840,8 +4840,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 { ; GFX10-SCRATCH-NEXT: v_lshlrev_b16 v1, 8, v1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2 @@ -4898,8 +4898,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 { ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v42, 1 ; GFX9-NEXT: v_readlane_b32 s30, v42, 0 +; GFX9-NEXT: v_readlane_b32 s31, v42, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v42, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -4936,8 +4936,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1 ; GFX10-NEXT: v_lshlrev_b16 v3, 8, v3 -; GFX10-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -4985,11 +4985,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 { ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l ; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1 -; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33 ; GFX11-TRUE16-NEXT: global_store_b32 v[40:41], v0, off ; GFX11-TRUE16-NEXT: s_clause 0x1 @@ -5033,8 +5033,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v3, 8, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v42, 1 ; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v42, 0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v42, 1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v2, v3 @@ -5082,8 +5082,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_lshlrev_b16 v1, 8, v1 ; GFX10-SCRATCH-NEXT: v_lshlrev_b16 v3, 8, v3 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -5146,8 +5146,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 { ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v42, 1 ; GFX9-NEXT: v_readlane_b32 s30, v42, 0 +; GFX9-NEXT: v_readlane_b32 s31, v42, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v42, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -5186,8 +5186,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1 ; GFX10-NEXT: v_lshlrev_b16 v3, 8, v3 -; GFX10-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -5242,7 +5242,7 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 { ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 4 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0 ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_store_b8 v[2:3], v4, off @@ -5251,7 +5251,6 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 { ; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s33 ; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 offset:4 ; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1 -; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33 ; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2 ; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1 @@ -5293,8 +5292,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 { ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-FAKE16-NEXT: v_lshlrev_b16 v3, 8, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v42, 1 ; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v42, 0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v42, 1 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v2, v3 @@ -5347,8 +5346,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_lshlrev_b16 v1, 8, v1 ; GFX10-SCRATCH-NEXT: v_lshlrev_b16 v3, 8, v3 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -5419,8 +5418,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 { ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v42, 1 ; GFX9-NEXT: v_readlane_b32 s30, v42, 0 +; GFX9-NEXT: v_readlane_b32 s31, v42, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v42, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -5464,12 +5463,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 { ; GFX10-NEXT: v_lshlrev_b16 v7, 8, v7 ; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1 ; GFX10-NEXT: v_lshlrev_b16 v3, 8, v3 -; GFX10-NEXT: v_readlane_b32 s31, v42, 1 +; GFX10-NEXT: v_readlane_b32 s30, v42, 0 ; GFX10-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v5, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_or_b32_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-NEXT: v_readlane_b32 s34, v42, 2 @@ -5527,14 +5526,13 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 { ; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.h, v0.h ; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v3.h, v2.h ; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v2.l, v3.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0 ; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v1.l ; GFX11-TRUE16-NEXT: global_store_b64 v[40:41], v[3:4], off ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s33 ; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 offset:4 ; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1 -; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33 ; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2 ; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1 @@ -5585,12 +5583,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v6, v7 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v42, 1 +; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v42, 0 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v2, v3 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v4 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v42, 0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v42, 1 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v1 ; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v2, v3 @@ -5642,12 +5640,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 { ; GFX10-SCRATCH-NEXT: v_lshlrev_b16 v7, 8, v7 ; GFX10-SCRATCH-NEXT: v_lshlrev_b16 v1, 8, v1 ; GFX10-SCRATCH-NEXT: v_lshlrev_b16 v3, 8, v3 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0 ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v5, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2 @@ -5781,8 +5779,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 { ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v44, 1 ; GFX9-NEXT: v_readlane_b32 s30, v44, 0 +; GFX9-NEXT: v_readlane_b32 s31, v44, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v44, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -5904,8 +5902,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 { ; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 -; GFX10-NEXT: v_readlane_b32 s31, v44, 1 ; GFX10-NEXT: v_readlane_b32 s30, v44, 0 +; GFX10-NEXT: v_readlane_b32 s31, v44, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v44, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -6032,8 +6030,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 { ; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s33 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s33 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 offset:12 -; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v44, 1 ; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v44, 0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v44, 1 ; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33 ; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v44, 2 ; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1 @@ -6183,8 +6181,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 { ; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s33 offset:4 ; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s33 offset:8 ; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s33 offset:12 -; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v44, 1 ; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v44, 0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v44, 1 ; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33 ; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v44, 2 ; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1 @@ -6306,8 +6304,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 { ; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:8 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:12 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v44, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v44, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v44, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v44, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -6342,8 +6340,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -6370,8 +6368,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -6399,8 +6397,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -6427,8 +6425,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -6460,8 +6458,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -6488,8 +6486,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -6517,8 +6515,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -6545,8 +6543,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -6578,8 +6576,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -6606,8 +6604,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -6635,8 +6633,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -6663,8 +6661,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -6697,8 +6695,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -6726,8 +6724,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 { ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -6755,8 +6753,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -6784,8 +6782,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 { ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -6817,8 +6815,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -6846,8 +6844,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 { ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -6876,8 +6874,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -6905,8 +6903,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 { ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -6937,8 +6935,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -6965,8 +6963,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -6994,8 +6992,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -7022,8 +7020,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -7056,8 +7054,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -7085,8 +7083,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 { ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -7115,8 +7113,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -7144,8 +7142,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 { ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -7176,8 +7174,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -7204,8 +7202,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -7233,8 +7231,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -7261,8 +7259,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -7294,8 +7292,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -7322,8 +7320,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -7351,8 +7349,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -7379,8 +7377,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -7413,8 +7411,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -7442,8 +7440,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 { ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -7471,8 +7469,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -7500,8 +7498,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 { ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -7534,8 +7532,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -7564,8 +7562,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 { ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -7594,8 +7592,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -7624,8 +7622,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 { ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -7659,8 +7657,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -7690,8 +7688,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 { ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -7720,8 +7718,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -7751,8 +7749,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 { ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -7783,8 +7781,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -7811,8 +7809,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -7840,8 +7838,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -7868,8 +7866,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -7904,8 +7902,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -7935,8 +7933,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 { ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -7965,8 +7963,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -7996,8 +7994,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 { ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -8032,8 +8030,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -8064,8 +8062,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 { ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -8095,8 +8093,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -8127,8 +8125,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 { ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -8163,8 +8161,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 { ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v8i32@abs32@lo ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -8196,8 +8194,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 { ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v8i32@abs32@lo ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -8230,8 +8228,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -8263,8 +8261,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 { ; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v8i32@abs32@lo ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -8304,8 +8302,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -8339,8 +8337,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 { ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -8371,8 +8369,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -8406,8 +8404,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 { ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -8444,8 +8442,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 { ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v16i32@abs32@lo ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -8479,8 +8477,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 { ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v16i32@abs32@lo ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -8515,8 +8513,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -8550,8 +8548,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 { ; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v16i32@abs32@lo ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -8595,8 +8593,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 { ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v32i32@abs32@lo ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -8634,8 +8632,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 { ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v32i32@abs32@lo ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -8674,8 +8672,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -8713,8 +8711,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 { ; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v32i32@abs32@lo ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -8761,8 +8759,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 { ; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -8803,8 +8801,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 { ; GFX10-NEXT: s_waitcnt vmcnt(8) ; GFX10-NEXT: buffer_store_dword v33, off, s[0:3], s32 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -8845,8 +8843,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 { ; GFX11-NEXT: s_waitcnt vmcnt(8) ; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -8887,8 +8885,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 { ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(8) ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v33, s32 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -8930,8 +8928,8 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v42, 1 ; GFX9-NEXT: v_readlane_b32 s30, v42, 0 +; GFX9-NEXT: v_readlane_b32 s31, v42, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v42, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -8967,8 +8965,8 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 -; GFX10-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v42, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -9004,8 +9002,8 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_load_b32 v41, off, s33 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:4 -; GFX11-NEXT: v_readlane_b32 s31, v42, 1 ; GFX11-NEXT: v_readlane_b32 s30, v42, 0 +; GFX11-NEXT: v_readlane_b32 s31, v42, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v42, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -9041,8 +9039,8 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou ; GFX10-SCRATCH-NEXT: s_clause 0x1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:4 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -9078,8 +9076,8 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 { ; GFX9-NEXT: s_mov_b32 s34, external_void_func_struct_i8_i32@abs32@lo ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -9111,8 +9109,8 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 { ; GFX10-NEXT: s_mov_b32 s34, external_void_func_struct_i8_i32@abs32@lo ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -9145,8 +9143,8 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 { ; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33 ; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1 @@ -9178,8 +9176,8 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 { ; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33 ; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1 @@ -9211,8 +9209,8 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 { ; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_struct_i8_i32@abs32@lo ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -9249,8 +9247,8 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -9281,8 +9279,8 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 { ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -9315,8 +9313,8 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 { ; GFX11-TRUE16-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33 ; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1 @@ -9347,8 +9345,8 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 { ; GFX11-FAKE16-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33 ; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s1, -1 @@ -9379,8 +9377,8 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 { ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -9424,8 +9422,8 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: buffer_load_ubyte v0, off, s[0:3], s33 offset:8 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:12 -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -9466,8 +9464,8 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: buffer_load_ubyte v0, off, s[0:3], s33 offset:8 ; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:12 -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -9508,8 +9506,8 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_load_d16_u8 v0, off, s33 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_b32 v1, off, s33 offset:12 -; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33 ; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -9548,8 +9546,8 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st ; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: scratch_load_u8 v0, off, s33 offset:8 ; GFX11-FAKE16-NEXT: scratch_load_b32 v1, off, s33 offset:12 -; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-FAKE16-NEXT: s_mov_b32 s32, s33 ; GFX11-FAKE16-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) @@ -9590,8 +9588,8 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st ; GFX10-SCRATCH-NEXT: s_clause 0x1 ; GFX10-SCRATCH-NEXT: scratch_load_ubyte v0, off, s33 offset:8 ; GFX10-SCRATCH-NEXT: scratch_load_dword v1, off, s33 offset:12 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) @@ -9662,8 +9660,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 { ; GFX9-NEXT: v_mov_b32_e32 v2, v17 ; GFX9-NEXT: v_mov_b32_e32 v3, v18 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -9713,8 +9711,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 { ; GFX10-NEXT: v_mov_b32_e32 v2, v17 ; GFX10-NEXT: v_mov_b32_e32 v3, v18 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -9761,8 +9759,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 { ; GFX11-NEXT: v_dual_mov_b32 v12, v3 :: v_dual_mov_b32 v1, v16 ; GFX11-NEXT: v_dual_mov_b32 v2, v17 :: v_dual_mov_b32 v3, v18 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -9812,8 +9810,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 { ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, v17 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, v18 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -9841,46 +9839,46 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:16 ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:20 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s35, 3 -; GFX9-NEXT: v_writelane_b32 v40, s36, 4 -; GFX9-NEXT: v_writelane_b32 v40, s37, 5 -; GFX9-NEXT: v_writelane_b32 v40, s38, 6 -; GFX9-NEXT: v_writelane_b32 v40, s39, 7 -; GFX9-NEXT: v_writelane_b32 v40, s48, 8 -; GFX9-NEXT: v_writelane_b32 v40, s49, 9 -; GFX9-NEXT: v_writelane_b32 v40, s50, 10 -; GFX9-NEXT: v_writelane_b32 v40, s51, 11 -; GFX9-NEXT: v_writelane_b32 v40, s52, 12 -; GFX9-NEXT: v_writelane_b32 v40, s53, 13 +; GFX9-NEXT: v_writelane_b32 v40, s34, 0 +; GFX9-NEXT: v_writelane_b32 v40, s35, 1 +; GFX9-NEXT: v_writelane_b32 v40, s36, 2 +; GFX9-NEXT: v_writelane_b32 v40, s37, 3 +; GFX9-NEXT: v_writelane_b32 v40, s38, 4 +; GFX9-NEXT: v_writelane_b32 v40, s39, 5 +; GFX9-NEXT: v_writelane_b32 v40, s48, 6 +; GFX9-NEXT: v_writelane_b32 v40, s49, 7 +; GFX9-NEXT: v_writelane_b32 v40, s50, 8 +; GFX9-NEXT: v_writelane_b32 v40, s51, 9 +; GFX9-NEXT: v_writelane_b32 v40, s52, 10 +; GFX9-NEXT: v_writelane_b32 v40, s53, 11 +; GFX9-NEXT: v_writelane_b32 v40, s54, 12 +; GFX9-NEXT: v_writelane_b32 v40, s55, 13 ; GFX9-NEXT: s_addk_i32 s32, 0x800 -; GFX9-NEXT: v_writelane_b32 v40, s54, 14 +; GFX9-NEXT: v_writelane_b32 v40, s30, 14 ; GFX9-NEXT: s_mov_b32 s5, byval_align16_f64_arg@abs32@hi ; GFX9-NEXT: s_mov_b32 s4, byval_align16_f64_arg@abs32@lo -; GFX9-NEXT: v_writelane_b32 v40, s55, 15 +; GFX9-NEXT: v_writelane_b32 v40, s31, 15 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s55, v40, 15 -; GFX9-NEXT: v_readlane_b32 s54, v40, 14 -; GFX9-NEXT: v_readlane_b32 s53, v40, 13 -; GFX9-NEXT: v_readlane_b32 s52, v40, 12 -; GFX9-NEXT: v_readlane_b32 s51, v40, 11 -; GFX9-NEXT: v_readlane_b32 s50, v40, 10 -; GFX9-NEXT: v_readlane_b32 s49, v40, 9 -; GFX9-NEXT: v_readlane_b32 s48, v40, 8 -; GFX9-NEXT: v_readlane_b32 s39, v40, 7 -; GFX9-NEXT: v_readlane_b32 s38, v40, 6 -; GFX9-NEXT: v_readlane_b32 s37, v40, 5 -; GFX9-NEXT: v_readlane_b32 s36, v40, 4 -; GFX9-NEXT: v_readlane_b32 s35, v40, 3 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s30, v40, 14 +; GFX9-NEXT: v_readlane_b32 s31, v40, 15 +; GFX9-NEXT: v_readlane_b32 s55, v40, 13 +; GFX9-NEXT: v_readlane_b32 s54, v40, 12 +; GFX9-NEXT: v_readlane_b32 s53, v40, 11 +; GFX9-NEXT: v_readlane_b32 s52, v40, 10 +; GFX9-NEXT: v_readlane_b32 s51, v40, 9 +; GFX9-NEXT: v_readlane_b32 s50, v40, 8 +; GFX9-NEXT: v_readlane_b32 s49, v40, 7 +; GFX9-NEXT: v_readlane_b32 s48, v40, 6 +; GFX9-NEXT: v_readlane_b32 s39, v40, 5 +; GFX9-NEXT: v_readlane_b32 s38, v40, 4 +; GFX9-NEXT: v_readlane_b32 s37, v40, 3 +; GFX9-NEXT: v_readlane_b32 s36, v40, 2 +; GFX9-NEXT: v_readlane_b32 s35, v40, 1 +; GFX9-NEXT: v_readlane_b32 s34, v40, 0 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload @@ -9902,7 +9900,7 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:16 ; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:20 ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s33 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x400 ; GFX10-NEXT: s_mov_b32 s5, byval_align16_f64_arg@abs32@hi ; GFX10-NEXT: s_mov_b32 s4, byval_align16_f64_arg@abs32@lo @@ -9910,38 +9908,38 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { ; GFX10-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 -; GFX10-NEXT: v_writelane_b32 v40, s35, 3 -; GFX10-NEXT: v_writelane_b32 v40, s36, 4 -; GFX10-NEXT: v_writelane_b32 v40, s37, 5 -; GFX10-NEXT: v_writelane_b32 v40, s38, 6 -; GFX10-NEXT: v_writelane_b32 v40, s39, 7 -; GFX10-NEXT: v_writelane_b32 v40, s48, 8 -; GFX10-NEXT: v_writelane_b32 v40, s49, 9 -; GFX10-NEXT: v_writelane_b32 v40, s50, 10 -; GFX10-NEXT: v_writelane_b32 v40, s51, 11 -; GFX10-NEXT: v_writelane_b32 v40, s52, 12 -; GFX10-NEXT: v_writelane_b32 v40, s53, 13 -; GFX10-NEXT: v_writelane_b32 v40, s54, 14 -; GFX10-NEXT: v_writelane_b32 v40, s55, 15 +; GFX10-NEXT: v_writelane_b32 v40, s35, 1 +; GFX10-NEXT: v_writelane_b32 v40, s36, 2 +; GFX10-NEXT: v_writelane_b32 v40, s37, 3 +; GFX10-NEXT: v_writelane_b32 v40, s38, 4 +; GFX10-NEXT: v_writelane_b32 v40, s39, 5 +; GFX10-NEXT: v_writelane_b32 v40, s48, 6 +; GFX10-NEXT: v_writelane_b32 v40, s49, 7 +; GFX10-NEXT: v_writelane_b32 v40, s50, 8 +; GFX10-NEXT: v_writelane_b32 v40, s51, 9 +; GFX10-NEXT: v_writelane_b32 v40, s52, 10 +; GFX10-NEXT: v_writelane_b32 v40, s53, 11 +; GFX10-NEXT: v_writelane_b32 v40, s54, 12 +; GFX10-NEXT: v_writelane_b32 v40, s55, 13 +; GFX10-NEXT: v_writelane_b32 v40, s30, 14 +; GFX10-NEXT: v_writelane_b32 v40, s31, 15 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s55, v40, 15 -; GFX10-NEXT: v_readlane_b32 s54, v40, 14 -; GFX10-NEXT: v_readlane_b32 s53, v40, 13 -; GFX10-NEXT: v_readlane_b32 s52, v40, 12 -; GFX10-NEXT: v_readlane_b32 s51, v40, 11 -; GFX10-NEXT: v_readlane_b32 s50, v40, 10 -; GFX10-NEXT: v_readlane_b32 s49, v40, 9 -; GFX10-NEXT: v_readlane_b32 s48, v40, 8 -; GFX10-NEXT: v_readlane_b32 s39, v40, 7 -; GFX10-NEXT: v_readlane_b32 s38, v40, 6 -; GFX10-NEXT: v_readlane_b32 s37, v40, 5 -; GFX10-NEXT: v_readlane_b32 s36, v40, 4 -; GFX10-NEXT: v_readlane_b32 s35, v40, 3 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s30, v40, 14 +; GFX10-NEXT: v_readlane_b32 s31, v40, 15 +; GFX10-NEXT: v_readlane_b32 s55, v40, 13 +; GFX10-NEXT: v_readlane_b32 s54, v40, 12 +; GFX10-NEXT: v_readlane_b32 s53, v40, 11 +; GFX10-NEXT: v_readlane_b32 s52, v40, 10 +; GFX10-NEXT: v_readlane_b32 s51, v40, 9 +; GFX10-NEXT: v_readlane_b32 s50, v40, 8 +; GFX10-NEXT: v_readlane_b32 s49, v40, 7 +; GFX10-NEXT: v_readlane_b32 s48, v40, 6 +; GFX10-NEXT: v_readlane_b32 s39, v40, 5 +; GFX10-NEXT: v_readlane_b32 s38, v40, 4 +; GFX10-NEXT: v_readlane_b32 s37, v40, 3 +; GFX10-NEXT: v_readlane_b32 s36, v40, 2 +; GFX10-NEXT: v_readlane_b32 s35, v40, 1 +; GFX10-NEXT: v_readlane_b32 s34, v40, 0 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload @@ -9962,44 +9960,44 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_load_b64 v[32:33], off, s33 offset:16 ; GFX11-NEXT: scratch_load_b32 v31, off, s33 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s34, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 32 ; GFX11-NEXT: s_mov_b32 s1, byval_align16_f64_arg@abs32@hi ; GFX11-NEXT: s_mov_b32 s0, byval_align16_f64_arg@abs32@lo -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: v_writelane_b32 v40, s34, 2 -; GFX11-NEXT: v_writelane_b32 v40, s35, 3 -; GFX11-NEXT: v_writelane_b32 v40, s36, 4 -; GFX11-NEXT: v_writelane_b32 v40, s37, 5 -; GFX11-NEXT: v_writelane_b32 v40, s38, 6 -; GFX11-NEXT: v_writelane_b32 v40, s39, 7 -; GFX11-NEXT: v_writelane_b32 v40, s48, 8 -; GFX11-NEXT: v_writelane_b32 v40, s49, 9 -; GFX11-NEXT: v_writelane_b32 v40, s50, 10 -; GFX11-NEXT: v_writelane_b32 v40, s51, 11 -; GFX11-NEXT: v_writelane_b32 v40, s52, 12 -; GFX11-NEXT: v_writelane_b32 v40, s53, 13 -; GFX11-NEXT: v_writelane_b32 v40, s54, 14 -; GFX11-NEXT: v_writelane_b32 v40, s55, 15 +; GFX11-NEXT: v_writelane_b32 v40, s35, 1 +; GFX11-NEXT: v_writelane_b32 v40, s36, 2 +; GFX11-NEXT: v_writelane_b32 v40, s37, 3 +; GFX11-NEXT: v_writelane_b32 v40, s38, 4 +; GFX11-NEXT: v_writelane_b32 v40, s39, 5 +; GFX11-NEXT: v_writelane_b32 v40, s48, 6 +; GFX11-NEXT: v_writelane_b32 v40, s49, 7 +; GFX11-NEXT: v_writelane_b32 v40, s50, 8 +; GFX11-NEXT: v_writelane_b32 v40, s51, 9 +; GFX11-NEXT: v_writelane_b32 v40, s52, 10 +; GFX11-NEXT: v_writelane_b32 v40, s53, 11 +; GFX11-NEXT: v_writelane_b32 v40, s54, 12 +; GFX11-NEXT: v_writelane_b32 v40, s55, 13 +; GFX11-NEXT: v_writelane_b32 v40, s30, 14 +; GFX11-NEXT: v_writelane_b32 v40, s31, 15 ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: scratch_store_b64 off, v[32:33], s32 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: v_readlane_b32 s55, v40, 15 -; GFX11-NEXT: v_readlane_b32 s54, v40, 14 -; GFX11-NEXT: v_readlane_b32 s53, v40, 13 -; GFX11-NEXT: v_readlane_b32 s52, v40, 12 -; GFX11-NEXT: v_readlane_b32 s51, v40, 11 -; GFX11-NEXT: v_readlane_b32 s50, v40, 10 -; GFX11-NEXT: v_readlane_b32 s49, v40, 9 -; GFX11-NEXT: v_readlane_b32 s48, v40, 8 -; GFX11-NEXT: v_readlane_b32 s39, v40, 7 -; GFX11-NEXT: v_readlane_b32 s38, v40, 6 -; GFX11-NEXT: v_readlane_b32 s37, v40, 5 -; GFX11-NEXT: v_readlane_b32 s36, v40, 4 -; GFX11-NEXT: v_readlane_b32 s35, v40, 3 -; GFX11-NEXT: v_readlane_b32 s34, v40, 2 -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s30, v40, 14 +; GFX11-NEXT: v_readlane_b32 s31, v40, 15 +; GFX11-NEXT: v_readlane_b32 s55, v40, 13 +; GFX11-NEXT: v_readlane_b32 s54, v40, 12 +; GFX11-NEXT: v_readlane_b32 s53, v40, 11 +; GFX11-NEXT: v_readlane_b32 s52, v40, 10 +; GFX11-NEXT: v_readlane_b32 s51, v40, 9 +; GFX11-NEXT: v_readlane_b32 s50, v40, 8 +; GFX11-NEXT: v_readlane_b32 s49, v40, 7 +; GFX11-NEXT: v_readlane_b32 s48, v40, 6 +; GFX11-NEXT: v_readlane_b32 s39, v40, 5 +; GFX11-NEXT: v_readlane_b32 s38, v40, 4 +; GFX11-NEXT: v_readlane_b32 s37, v40, 3 +; GFX11-NEXT: v_readlane_b32 s36, v40, 2 +; GFX11-NEXT: v_readlane_b32 s35, v40, 1 +; GFX11-NEXT: v_readlane_b32 s34, v40, 0 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:24 ; 4-byte Folded Reload @@ -10020,44 +10018,44 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { ; GFX10-SCRATCH-NEXT: s_clause 0x1 ; GFX10-SCRATCH-NEXT: scratch_load_dwordx2 v[32:33], off, s33 offset:16 ; GFX10-SCRATCH-NEXT: scratch_load_dword v31, off, s33 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s34, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32 ; GFX10-SCRATCH-NEXT: s_mov_b32 s1, byval_align16_f64_arg@abs32@hi ; GFX10-SCRATCH-NEXT: s_mov_b32 s0, byval_align16_f64_arg@abs32@lo -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s34, 2 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s35, 3 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s36, 4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s37, 5 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s38, 6 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s39, 7 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s48, 8 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s49, 9 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s50, 10 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s51, 11 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s52, 12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s53, 13 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s54, 14 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s55, 15 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s35, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s36, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s37, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s38, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s39, 5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s48, 6 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s49, 7 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s50, 8 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s51, 9 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s52, 10 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s53, 11 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s54, 12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s55, 13 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 14 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 15 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(1) ; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[32:33], s32 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s55, v40, 15 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s54, v40, 14 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s53, v40, 13 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s52, v40, 12 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s51, v40, 11 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s50, v40, 10 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s49, v40, 9 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s48, v40, 8 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s39, v40, 7 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s38, v40, 6 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s37, v40, 5 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s36, v40, 4 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s35, v40, 3 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s34, v40, 2 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 14 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 15 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s55, v40, 13 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s54, v40, 12 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s53, v40, 11 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s52, v40, 10 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s51, v40, 9 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s50, v40, 8 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s49, v40, 7 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s48, v40, 6 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s39, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s38, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s37, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s36, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s35, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s34, v40, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:24 ; 4-byte Folded Reload @@ -10091,8 +10089,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 { ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s32 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -10120,8 +10118,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 { ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -10150,8 +10148,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -10179,8 +10177,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -10212,8 +10210,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 +; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 3 @@ -10242,8 +10240,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 +; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 3 @@ -10273,8 +10271,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 +; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 3 @@ -10303,8 +10301,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 @@ -10337,8 +10335,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 +; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 3 @@ -10367,8 +10365,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 +; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 3 @@ -10398,8 +10396,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 +; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 3 @@ -10428,8 +10426,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 @@ -10462,8 +10460,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 +; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 3 @@ -10492,8 +10490,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 +; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 3 @@ -10523,8 +10521,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 +; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 3 @@ -10553,8 +10551,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 @@ -10589,8 +10587,8 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 3 ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 3 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_mov_b32 s32, s33 @@ -10622,8 +10620,8 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 3 ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 3 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_mov_b32 s32, s33 @@ -10656,8 +10654,8 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 3 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 3 ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 3 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_mov_b32 s32, s33 @@ -10689,8 +10687,8 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 @@ -10728,8 +10726,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 5 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 5 ; GFX9-NEXT: v_readlane_b32 s30, v40, 4 +; GFX9-NEXT: v_readlane_b32 s31, v40, 5 ; GFX9-NEXT: v_readlane_b32 s7, v40, 3 ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -10765,8 +10763,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 4 ; GFX10-NEXT: v_writelane_b32 v40, s31, 5 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 5 ; GFX10-NEXT: v_readlane_b32 s30, v40, 4 +; GFX10-NEXT: v_readlane_b32 s31, v40, 5 ; GFX10-NEXT: v_readlane_b32 s7, v40, 3 ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 @@ -10803,8 +10801,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 5 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 5 ; GFX11-NEXT: v_readlane_b32 s30, v40, 4 +; GFX11-NEXT: v_readlane_b32 s31, v40, 5 ; GFX11-NEXT: v_readlane_b32 s7, v40, 3 ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 @@ -10840,8 +10838,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 @@ -10884,8 +10882,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 5 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 5 ; GFX9-NEXT: v_readlane_b32 s30, v40, 4 +; GFX9-NEXT: v_readlane_b32 s31, v40, 5 ; GFX9-NEXT: v_readlane_b32 s7, v40, 3 ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -10923,8 +10921,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 4 ; GFX10-NEXT: v_writelane_b32 v40, s31, 5 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 5 ; GFX10-NEXT: v_readlane_b32 s30, v40, 4 +; GFX10-NEXT: v_readlane_b32 s31, v40, 5 ; GFX10-NEXT: v_readlane_b32 s7, v40, 3 ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 @@ -10963,8 +10961,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 5 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 5 ; GFX11-NEXT: v_readlane_b32 s30, v40, 4 +; GFX11-NEXT: v_readlane_b32 s31, v40, 5 ; GFX11-NEXT: v_readlane_b32 s7, v40, 3 ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 @@ -11002,8 +11000,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 @@ -11047,8 +11045,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 7 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 7 ; GFX9-NEXT: v_readlane_b32 s30, v40, 6 +; GFX9-NEXT: v_readlane_b32 s31, v40, 7 ; GFX9-NEXT: v_readlane_b32 s9, v40, 5 ; GFX9-NEXT: v_readlane_b32 s8, v40, 4 ; GFX9-NEXT: v_readlane_b32 s7, v40, 3 @@ -11090,8 +11088,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 6 ; GFX10-NEXT: v_writelane_b32 v40, s31, 7 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 7 ; GFX10-NEXT: v_readlane_b32 s30, v40, 6 +; GFX10-NEXT: v_readlane_b32 s31, v40, 7 ; GFX10-NEXT: v_readlane_b32 s9, v40, 5 ; GFX10-NEXT: v_readlane_b32 s8, v40, 4 ; GFX10-NEXT: v_readlane_b32 s7, v40, 3 @@ -11134,8 +11132,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 7 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 7 ; GFX11-NEXT: v_readlane_b32 s30, v40, 6 +; GFX11-NEXT: v_readlane_b32 s31, v40, 7 ; GFX11-NEXT: v_readlane_b32 s9, v40, 5 ; GFX11-NEXT: v_readlane_b32 s8, v40, 4 ; GFX11-NEXT: v_readlane_b32 s7, v40, 3 @@ -11177,8 +11175,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 6 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 7 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 7 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 6 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 7 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s9, v40, 5 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s8, v40, 4 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 @@ -11231,8 +11229,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 9 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 9 ; GFX9-NEXT: v_readlane_b32 s30, v40, 8 +; GFX9-NEXT: v_readlane_b32 s31, v40, 9 ; GFX9-NEXT: v_readlane_b32 s11, v40, 7 ; GFX9-NEXT: v_readlane_b32 s10, v40, 6 ; GFX9-NEXT: v_readlane_b32 s9, v40, 5 @@ -11280,8 +11278,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 8 ; GFX10-NEXT: v_writelane_b32 v40, s31, 9 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 9 ; GFX10-NEXT: v_readlane_b32 s30, v40, 8 +; GFX10-NEXT: v_readlane_b32 s31, v40, 9 ; GFX10-NEXT: v_readlane_b32 s11, v40, 7 ; GFX10-NEXT: v_readlane_b32 s10, v40, 6 ; GFX10-NEXT: v_readlane_b32 s9, v40, 5 @@ -11330,8 +11328,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 9 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 9 ; GFX11-NEXT: v_readlane_b32 s30, v40, 8 +; GFX11-NEXT: v_readlane_b32 s31, v40, 9 ; GFX11-NEXT: v_readlane_b32 s11, v40, 7 ; GFX11-NEXT: v_readlane_b32 s10, v40, 6 ; GFX11-NEXT: v_readlane_b32 s9, v40, 5 @@ -11379,8 +11377,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 8 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 9 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 9 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 8 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 9 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s11, v40, 7 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s10, v40, 6 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s9, v40, 5 @@ -11422,8 +11420,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 +; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 3 @@ -11452,8 +11450,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 +; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 3 @@ -11483,8 +11481,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 +; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 3 @@ -11513,8 +11511,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 @@ -11547,8 +11545,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 +; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 3 @@ -11577,8 +11575,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 +; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 3 @@ -11608,8 +11606,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 +; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 3 @@ -11638,8 +11636,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 @@ -11674,8 +11672,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 3 ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 3 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_mov_b32 s32, s33 @@ -11707,8 +11705,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 3 ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 3 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_mov_b32 s32, s33 @@ -11741,8 +11739,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 3 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 3 ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 3 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_mov_b32 s32, s33 @@ -11774,8 +11772,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 @@ -11813,8 +11811,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 4 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 4 ; GFX9-NEXT: v_readlane_b32 s30, v40, 3 +; GFX9-NEXT: v_readlane_b32 s31, v40, 4 ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 @@ -11849,8 +11847,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 3 ; GFX10-NEXT: v_writelane_b32 v40, s31, 4 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 4 ; GFX10-NEXT: v_readlane_b32 s30, v40, 3 +; GFX10-NEXT: v_readlane_b32 s31, v40, 4 ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 @@ -11886,8 +11884,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 4 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 4 ; GFX11-NEXT: v_readlane_b32 s30, v40, 3 +; GFX11-NEXT: v_readlane_b32 s31, v40, 4 ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 @@ -11922,8 +11920,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 3 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 4 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 4 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 4 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 @@ -11966,8 +11964,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 6 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 6 ; GFX9-NEXT: v_readlane_b32 s30, v40, 5 +; GFX9-NEXT: v_readlane_b32 s31, v40, 6 ; GFX9-NEXT: v_readlane_b32 s8, v40, 4 ; GFX9-NEXT: v_readlane_b32 s7, v40, 3 ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 @@ -12008,8 +12006,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 5 ; GFX10-NEXT: v_writelane_b32 v40, s31, 6 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 6 ; GFX10-NEXT: v_readlane_b32 s30, v40, 5 +; GFX10-NEXT: v_readlane_b32 s31, v40, 6 ; GFX10-NEXT: v_readlane_b32 s8, v40, 4 ; GFX10-NEXT: v_readlane_b32 s7, v40, 3 ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 @@ -12051,8 +12049,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 6 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 6 ; GFX11-NEXT: v_readlane_b32 s30, v40, 5 +; GFX11-NEXT: v_readlane_b32 s31, v40, 6 ; GFX11-NEXT: v_readlane_b32 s8, v40, 4 ; GFX11-NEXT: v_readlane_b32 s7, v40, 3 ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 @@ -12093,8 +12091,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 5 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 6 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 6 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 6 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s8, v40, 4 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 @@ -12133,8 +12131,8 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 3 ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 3 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_mov_b32 s32, s33 @@ -12166,8 +12164,8 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 3 ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 3 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_mov_b32 s32, s33 @@ -12200,8 +12198,8 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 3 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 3 ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 3 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_mov_b32 s32, s33 @@ -12233,8 +12231,8 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 @@ -12274,8 +12272,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 5 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 5 ; GFX9-NEXT: v_readlane_b32 s30, v40, 4 +; GFX9-NEXT: v_readlane_b32 s31, v40, 5 ; GFX9-NEXT: v_readlane_b32 s7, v40, 3 ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -12313,8 +12311,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 4 ; GFX10-NEXT: v_writelane_b32 v40, s31, 5 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 5 ; GFX10-NEXT: v_readlane_b32 s30, v40, 4 +; GFX10-NEXT: v_readlane_b32 s31, v40, 5 ; GFX10-NEXT: v_readlane_b32 s7, v40, 3 ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 @@ -12353,8 +12351,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 5 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 5 ; GFX11-NEXT: v_readlane_b32 s30, v40, 4 +; GFX11-NEXT: v_readlane_b32 s31, v40, 5 ; GFX11-NEXT: v_readlane_b32 s7, v40, 3 ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 @@ -12392,8 +12390,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 @@ -12439,8 +12437,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 7 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 7 ; GFX9-NEXT: v_readlane_b32 s30, v40, 6 +; GFX9-NEXT: v_readlane_b32 s31, v40, 7 ; GFX9-NEXT: v_readlane_b32 s9, v40, 5 ; GFX9-NEXT: v_readlane_b32 s8, v40, 4 ; GFX9-NEXT: v_readlane_b32 s7, v40, 3 @@ -12484,8 +12482,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 6 ; GFX10-NEXT: v_writelane_b32 v40, s31, 7 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 7 ; GFX10-NEXT: v_readlane_b32 s30, v40, 6 +; GFX10-NEXT: v_readlane_b32 s31, v40, 7 ; GFX10-NEXT: v_readlane_b32 s9, v40, 5 ; GFX10-NEXT: v_readlane_b32 s8, v40, 4 ; GFX10-NEXT: v_readlane_b32 s7, v40, 3 @@ -12530,8 +12528,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 7 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 7 ; GFX11-NEXT: v_readlane_b32 s30, v40, 6 +; GFX11-NEXT: v_readlane_b32 s31, v40, 7 ; GFX11-NEXT: v_readlane_b32 s9, v40, 5 ; GFX11-NEXT: v_readlane_b32 s8, v40, 4 ; GFX11-NEXT: v_readlane_b32 s7, v40, 3 @@ -12575,8 +12573,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 6 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 7 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 7 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 6 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 7 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s9, v40, 5 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s8, v40, 4 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 @@ -12614,8 +12612,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 +; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 3 @@ -12644,8 +12642,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 +; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 3 @@ -12675,8 +12673,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 +; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 3 @@ -12705,8 +12703,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 @@ -12741,8 +12739,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 3 ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 3 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_mov_b32 s32, s33 @@ -12773,8 +12771,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 3 ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 3 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_mov_b32 s32, s33 @@ -12806,8 +12804,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 3 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 3 ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 3 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_mov_b32 s32, s33 @@ -12838,8 +12836,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 @@ -12875,8 +12873,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 3 ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 3 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_mov_b32 s32, s33 @@ -12907,8 +12905,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 3 ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 3 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_mov_b32 s32, s33 @@ -12940,8 +12938,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 3 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 3 ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 3 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_mov_b32 s32, s33 @@ -12972,8 +12970,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 @@ -13010,8 +13008,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 3 ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 3 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_mov_b32 s32, s33 @@ -13043,8 +13041,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 3 ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 3 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_mov_b32 s32, s33 @@ -13077,8 +13075,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 3 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 3 ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 3 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_mov_b32 s32, s33 @@ -13110,8 +13108,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 @@ -13147,8 +13145,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 3 ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 3 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_mov_b32 s32, s33 @@ -13180,8 +13178,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 3 ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 3 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_mov_b32 s32, s33 @@ -13214,8 +13212,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 3 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 3 ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 3 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_mov_b32 s32, s33 @@ -13247,8 +13245,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 @@ -13283,8 +13281,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 3 ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 3 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_mov_b32 s32, s33 @@ -13315,8 +13313,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 3 ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 3 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_mov_b32 s32, s33 @@ -13348,8 +13346,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 3 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 3 ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 3 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_mov_b32 s32, s33 @@ -13380,8 +13378,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 @@ -13418,8 +13416,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 3 ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 3 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_mov_b32 s32, s33 @@ -13451,8 +13449,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 3 ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 3 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_mov_b32 s32, s33 @@ -13485,8 +13483,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 3 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 3 ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 3 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_mov_b32 s32, s33 @@ -13518,8 +13516,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 @@ -13553,8 +13551,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 +; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 3 @@ -13583,8 +13581,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 +; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 3 @@ -13614,8 +13612,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 +; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 3 @@ -13644,8 +13642,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 @@ -13680,8 +13678,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 3 ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 3 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_mov_b32 s32, s33 @@ -13712,8 +13710,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 3 ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 3 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_mov_b32 s32, s33 @@ -13745,8 +13743,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 3 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 3 ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 3 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_mov_b32 s32, s33 @@ -13777,8 +13775,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 @@ -13815,8 +13813,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 3 ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 3 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_mov_b32 s32, s33 @@ -13848,8 +13846,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 3 ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 3 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_mov_b32 s32, s33 @@ -13882,8 +13880,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 3 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 3 ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 3 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_mov_b32 s32, s33 @@ -13915,8 +13913,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 @@ -13954,8 +13952,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 4 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 4 ; GFX9-NEXT: v_readlane_b32 s30, v40, 3 +; GFX9-NEXT: v_readlane_b32 s31, v40, 4 ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 @@ -13990,8 +13988,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 3 ; GFX10-NEXT: v_writelane_b32 v40, s31, 4 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 4 ; GFX10-NEXT: v_readlane_b32 s30, v40, 3 +; GFX10-NEXT: v_readlane_b32 s31, v40, 4 ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 @@ -14027,8 +14025,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 4 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 4 ; GFX11-NEXT: v_readlane_b32 s30, v40, 3 +; GFX11-NEXT: v_readlane_b32 s31, v40, 4 ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 @@ -14063,8 +14061,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 3 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 4 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 4 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 4 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 @@ -14105,8 +14103,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 5 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 5 ; GFX9-NEXT: v_readlane_b32 s30, v40, 4 +; GFX9-NEXT: v_readlane_b32 s31, v40, 5 ; GFX9-NEXT: v_readlane_b32 s7, v40, 3 ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -14144,8 +14142,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 4 ; GFX10-NEXT: v_writelane_b32 v40, s31, 5 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 5 ; GFX10-NEXT: v_readlane_b32 s30, v40, 4 +; GFX10-NEXT: v_readlane_b32 s31, v40, 5 ; GFX10-NEXT: v_readlane_b32 s7, v40, 3 ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 @@ -14184,8 +14182,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 5 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 5 ; GFX11-NEXT: v_readlane_b32 s30, v40, 4 +; GFX11-NEXT: v_readlane_b32 s31, v40, 5 ; GFX11-NEXT: v_readlane_b32 s7, v40, 3 ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 @@ -14223,8 +14221,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 @@ -14263,8 +14261,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 5 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 5 ; GFX9-NEXT: v_readlane_b32 s30, v40, 4 +; GFX9-NEXT: v_readlane_b32 s31, v40, 5 ; GFX9-NEXT: v_readlane_b32 s7, v40, 3 ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -14299,8 +14297,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 4 ; GFX10-NEXT: v_writelane_b32 v40, s31, 5 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 5 ; GFX10-NEXT: v_readlane_b32 s30, v40, 4 +; GFX10-NEXT: v_readlane_b32 s31, v40, 5 ; GFX10-NEXT: v_readlane_b32 s7, v40, 3 ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 @@ -14336,8 +14334,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 5 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 5 ; GFX11-NEXT: v_readlane_b32 s30, v40, 4 +; GFX11-NEXT: v_readlane_b32 s31, v40, 5 ; GFX11-NEXT: v_readlane_b32 s7, v40, 3 ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 @@ -14372,8 +14370,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 @@ -14416,8 +14414,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 5 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 5 ; GFX9-NEXT: v_readlane_b32 s30, v40, 4 +; GFX9-NEXT: v_readlane_b32 s31, v40, 5 ; GFX9-NEXT: v_readlane_b32 s7, v40, 3 ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 @@ -14455,8 +14453,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 4 ; GFX10-NEXT: v_writelane_b32 v40, s31, 5 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 5 ; GFX10-NEXT: v_readlane_b32 s30, v40, 4 +; GFX10-NEXT: v_readlane_b32 s31, v40, 5 ; GFX10-NEXT: v_readlane_b32 s7, v40, 3 ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 @@ -14495,8 +14493,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 5 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 5 ; GFX11-NEXT: v_readlane_b32 s30, v40, 4 +; GFX11-NEXT: v_readlane_b32 s31, v40, 5 ; GFX11-NEXT: v_readlane_b32 s7, v40, 3 ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 @@ -14534,8 +14532,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 @@ -14579,8 +14577,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 6 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 6 ; GFX9-NEXT: v_readlane_b32 s30, v40, 5 +; GFX9-NEXT: v_readlane_b32 s31, v40, 6 ; GFX9-NEXT: v_readlane_b32 s8, v40, 4 ; GFX9-NEXT: v_readlane_b32 s7, v40, 3 ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 @@ -14621,8 +14619,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 5 ; GFX10-NEXT: v_writelane_b32 v40, s31, 6 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 6 ; GFX10-NEXT: v_readlane_b32 s30, v40, 5 +; GFX10-NEXT: v_readlane_b32 s31, v40, 6 ; GFX10-NEXT: v_readlane_b32 s8, v40, 4 ; GFX10-NEXT: v_readlane_b32 s7, v40, 3 ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 @@ -14664,8 +14662,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 6 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 6 ; GFX11-NEXT: v_readlane_b32 s30, v40, 5 +; GFX11-NEXT: v_readlane_b32 s31, v40, 6 ; GFX11-NEXT: v_readlane_b32 s8, v40, 4 ; GFX11-NEXT: v_readlane_b32 s7, v40, 3 ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 @@ -14706,8 +14704,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 5 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 6 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 6 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 6 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s8, v40, 4 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 @@ -14753,8 +14751,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 9 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 9 ; GFX9-NEXT: v_readlane_b32 s30, v40, 8 +; GFX9-NEXT: v_readlane_b32 s31, v40, 9 ; GFX9-NEXT: v_readlane_b32 s11, v40, 7 ; GFX9-NEXT: v_readlane_b32 s10, v40, 6 ; GFX9-NEXT: v_readlane_b32 s9, v40, 5 @@ -14799,8 +14797,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 8 ; GFX10-NEXT: v_writelane_b32 v40, s31, 9 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 9 ; GFX10-NEXT: v_readlane_b32 s30, v40, 8 +; GFX10-NEXT: v_readlane_b32 s31, v40, 9 ; GFX10-NEXT: v_readlane_b32 s11, v40, 7 ; GFX10-NEXT: v_readlane_b32 s10, v40, 6 ; GFX10-NEXT: v_readlane_b32 s9, v40, 5 @@ -14846,8 +14844,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 9 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 9 ; GFX11-NEXT: v_readlane_b32 s30, v40, 8 +; GFX11-NEXT: v_readlane_b32 s31, v40, 9 ; GFX11-NEXT: v_readlane_b32 s11, v40, 7 ; GFX11-NEXT: v_readlane_b32 s10, v40, 6 ; GFX11-NEXT: v_readlane_b32 s9, v40, 5 @@ -14892,8 +14890,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 8 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 9 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 9 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 8 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 9 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s11, v40, 7 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s10, v40, 6 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s9, v40, 5 @@ -14949,8 +14947,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 9 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 9 ; GFX9-NEXT: v_readlane_b32 s30, v40, 8 +; GFX9-NEXT: v_readlane_b32 s31, v40, 9 ; GFX9-NEXT: v_readlane_b32 s11, v40, 7 ; GFX9-NEXT: v_readlane_b32 s10, v40, 6 ; GFX9-NEXT: v_readlane_b32 s9, v40, 5 @@ -15000,8 +14998,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 8 ; GFX10-NEXT: v_writelane_b32 v40, s31, 9 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 9 ; GFX10-NEXT: v_readlane_b32 s30, v40, 8 +; GFX10-NEXT: v_readlane_b32 s31, v40, 9 ; GFX10-NEXT: v_readlane_b32 s11, v40, 7 ; GFX10-NEXT: v_readlane_b32 s10, v40, 6 ; GFX10-NEXT: v_readlane_b32 s9, v40, 5 @@ -15052,8 +15050,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 9 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 9 ; GFX11-NEXT: v_readlane_b32 s30, v40, 8 +; GFX11-NEXT: v_readlane_b32 s31, v40, 9 ; GFX11-NEXT: v_readlane_b32 s11, v40, 7 ; GFX11-NEXT: v_readlane_b32 s10, v40, 6 ; GFX11-NEXT: v_readlane_b32 s9, v40, 5 @@ -15103,8 +15101,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 8 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 9 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 9 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 8 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 9 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s11, v40, 7 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s10, v40, 6 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s9, v40, 5 @@ -15161,8 +15159,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 17 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 17 ; GFX9-NEXT: v_readlane_b32 s30, v40, 16 +; GFX9-NEXT: v_readlane_b32 s31, v40, 17 ; GFX9-NEXT: v_readlane_b32 s19, v40, 15 ; GFX9-NEXT: v_readlane_b32 s18, v40, 14 ; GFX9-NEXT: v_readlane_b32 s17, v40, 13 @@ -15223,8 +15221,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 16 ; GFX10-NEXT: v_writelane_b32 v40, s31, 17 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 17 ; GFX10-NEXT: v_readlane_b32 s30, v40, 16 +; GFX10-NEXT: v_readlane_b32 s31, v40, 17 ; GFX10-NEXT: v_readlane_b32 s19, v40, 15 ; GFX10-NEXT: v_readlane_b32 s18, v40, 14 ; GFX10-NEXT: v_readlane_b32 s17, v40, 13 @@ -15286,8 +15284,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 17 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 17 ; GFX11-NEXT: v_readlane_b32 s30, v40, 16 +; GFX11-NEXT: v_readlane_b32 s31, v40, 17 ; GFX11-NEXT: v_readlane_b32 s19, v40, 15 ; GFX11-NEXT: v_readlane_b32 s18, v40, 14 ; GFX11-NEXT: v_readlane_b32 s17, v40, 13 @@ -15348,8 +15346,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 17 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 17 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 16 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 17 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s19, v40, 15 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s18, v40, 14 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s17, v40, 13 @@ -15450,8 +15448,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 { ; GFX9-NEXT: v_writelane_b32 v40, s31, 27 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 27 ; GFX9-NEXT: v_readlane_b32 s30, v40, 26 +; GFX9-NEXT: v_readlane_b32 s31, v40, 27 ; GFX9-NEXT: v_readlane_b32 s29, v40, 25 ; GFX9-NEXT: v_readlane_b32 s28, v40, 24 ; GFX9-NEXT: v_readlane_b32 s27, v40, 23 @@ -15557,8 +15555,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 26 ; GFX10-NEXT: v_writelane_b32 v40, s31, 27 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 27 ; GFX10-NEXT: v_readlane_b32 s30, v40, 26 +; GFX10-NEXT: v_readlane_b32 s31, v40, 27 ; GFX10-NEXT: v_readlane_b32 s29, v40, 25 ; GFX10-NEXT: v_readlane_b32 s28, v40, 24 ; GFX10-NEXT: v_readlane_b32 s27, v40, 23 @@ -15660,8 +15658,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 27 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 27 ; GFX11-NEXT: v_readlane_b32 s30, v40, 26 +; GFX11-NEXT: v_readlane_b32 s31, v40, 27 ; GFX11-NEXT: v_readlane_b32 s29, v40, 25 ; GFX11-NEXT: v_readlane_b32 s28, v40, 24 ; GFX11-NEXT: v_readlane_b32 s27, v40, 23 @@ -15764,8 +15762,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 26 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 27 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 27 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 26 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 27 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s29, v40, 25 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s28, v40, 24 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s27, v40, 23 @@ -15881,8 +15879,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 { ; GFX9-NEXT: v_writelane_b32 v40, s31, 27 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 27 ; GFX9-NEXT: v_readlane_b32 s30, v40, 26 +; GFX9-NEXT: v_readlane_b32 s31, v40, 27 ; GFX9-NEXT: v_readlane_b32 s29, v40, 25 ; GFX9-NEXT: v_readlane_b32 s28, v40, 24 ; GFX9-NEXT: v_readlane_b32 s27, v40, 23 @@ -15993,8 +15991,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 26 ; GFX10-NEXT: v_writelane_b32 v40, s31, 27 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 27 ; GFX10-NEXT: v_readlane_b32 s30, v40, 26 +; GFX10-NEXT: v_readlane_b32 s31, v40, 27 ; GFX10-NEXT: v_readlane_b32 s29, v40, 25 ; GFX10-NEXT: v_readlane_b32 s28, v40, 24 ; GFX10-NEXT: v_readlane_b32 s27, v40, 23 @@ -16100,8 +16098,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 27 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 27 ; GFX11-NEXT: v_readlane_b32 s30, v40, 26 +; GFX11-NEXT: v_readlane_b32 s31, v40, 27 ; GFX11-NEXT: v_readlane_b32 s29, v40, 25 ; GFX11-NEXT: v_readlane_b32 s28, v40, 24 ; GFX11-NEXT: v_readlane_b32 s27, v40, 23 @@ -16210,8 +16208,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 26 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 27 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 27 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 26 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 27 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s29, v40, 25 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s28, v40, 24 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s27, v40, 23 @@ -16276,8 +16274,8 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -16310,8 +16308,8 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -16340,8 +16338,8 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: scratch_store_b64 off, v[32:33], s32 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -16370,8 +16368,8 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[32:33], s32 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -16442,8 +16440,8 @@ define amdgpu_gfx void @stack_12xv3i32() #0 { ; GFX9-NEXT: v_mov_b32_e32 v31, 11 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -16509,8 +16507,8 @@ define amdgpu_gfx void @stack_12xv3i32() #0 { ; GFX10-NEXT: s_mov_b32 s34, external_void_func_12xv3i32@abs32@lo ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -16556,8 +16554,8 @@ define amdgpu_gfx void @stack_12xv3i32() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -16620,8 +16618,8 @@ define amdgpu_gfx void @stack_12xv3i32() #0 { ; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_12xv3i32@abs32@lo ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -16712,8 +16710,8 @@ define amdgpu_gfx void @stack_8xv5i32() #0 { ; GFX9-NEXT: v_mov_b32_e32 v31, 7 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -16787,8 +16785,8 @@ define amdgpu_gfx void @stack_8xv5i32() #0 { ; GFX10-NEXT: s_mov_b32 s34, external_void_func_8xv5i32@abs32@lo ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -16838,8 +16836,8 @@ define amdgpu_gfx void @stack_8xv5i32() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -16908,8 +16906,8 @@ define amdgpu_gfx void @stack_8xv5i32() #0 { ; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_8xv5i32@abs32@lo ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -16996,8 +16994,8 @@ define amdgpu_gfx void @stack_8xv5f32() #0 { ; GFX9-NEXT: v_mov_b32_e32 v31, 0x40e00000 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -17071,8 +17069,8 @@ define amdgpu_gfx void @stack_8xv5f32() #0 { ; GFX10-NEXT: s_mov_b32 s34, external_void_func_8xv5f32@abs32@lo ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -17127,8 +17125,8 @@ define amdgpu_gfx void @stack_8xv5f32() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -17197,8 +17195,8 @@ define amdgpu_gfx void @stack_8xv5f32() #0 { ; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_8xv5f32@abs32@lo ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -17237,8 +17235,8 @@ define amdgpu_gfx void @test_call_external_void_func_bf16(i16 %arg) #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -17264,8 +17262,8 @@ define amdgpu_gfx void @test_call_external_void_func_bf16(i16 %arg) #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -17292,8 +17290,8 @@ define amdgpu_gfx void @test_call_external_void_func_bf16(i16 %arg) #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -17319,8 +17317,8 @@ define amdgpu_gfx void @test_call_external_void_func_bf16(i16 %arg) #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -17351,8 +17349,8 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16(i16 %arg) #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -17378,8 +17376,8 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16(i16 %arg) #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -17406,8 +17404,8 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16(i16 %arg) #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -17433,8 +17431,8 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16(i16 %arg) #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -17465,8 +17463,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16(i32 %arg) #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -17492,8 +17490,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16(i32 %arg) #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -17520,8 +17518,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16(i32 %arg) #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -17547,8 +17545,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16(i32 %arg) #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -17579,8 +17577,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -17606,8 +17604,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -17634,8 +17632,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -17661,8 +17659,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -17693,8 +17691,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16(<4 x i16> %arg) #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -17720,8 +17718,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16(<4 x i16> %arg) #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -17748,8 +17746,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16(<4 x i16> %arg) #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -17775,8 +17773,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16(<4 x i16> %arg) #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -17807,8 +17805,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16(<8 x i16> %arg) #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -17834,8 +17832,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16(<8 x i16> %arg) #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -17862,8 +17860,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16(<8 x i16> %arg) #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -17889,8 +17887,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16(<8 x i16> %arg) #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -17921,8 +17919,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16(<16 x i16> %arg) #0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -17948,8 +17946,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16(<16 x i16> %arg) #0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -17976,8 +17974,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16(<16 x i16> %arg) #0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -18003,8 +18001,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16(<16 x i16> %arg) #0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -18035,8 +18033,8 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg) ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -18062,8 +18060,8 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg) ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -18090,8 +18088,8 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg) ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -18117,8 +18115,8 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg) ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -18149,8 +18147,8 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -18176,8 +18174,8 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -18204,8 +18202,8 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -18231,8 +18229,8 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -18263,8 +18261,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -18290,8 +18288,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -18318,8 +18316,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -18345,8 +18343,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -18377,8 +18375,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -18404,8 +18402,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -18432,8 +18430,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -18459,8 +18457,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -18491,8 +18489,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -18518,8 +18516,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -18546,8 +18544,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -18573,8 +18571,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -18605,8 +18603,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -18632,8 +18630,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -18660,8 +18658,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -18687,8 +18685,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 @@ -18719,8 +18717,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -18746,8 +18744,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -18774,8 +18772,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -18801,8 +18799,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll index 124de7e00f020..ecf2b33841076 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll @@ -26,8 +26,8 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_readlane_b32 s31, v40, 3 ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 +; GFX9-NEXT: v_readlane_b32 s31, v40, 3 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_mov_b32 s32, s33 @@ -60,8 +60,8 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_readlane_b32 s31, v40, 3 ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 3 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_mov_b32 s32, s33 @@ -95,8 +95,8 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 3 ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 3 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_mov_b32 s32, s33 @@ -130,8 +130,8 @@ define amdgpu_gfx void @void_func_void_clobber_s28_s29() #1 { ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; clobber ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: v_readlane_b32 s31, v0, 3 ; GFX9-NEXT: v_readlane_b32 s30, v0, 2 +; GFX9-NEXT: v_readlane_b32 s31, v0, 3 ; GFX9-NEXT: v_readlane_b32 s29, v0, 1 ; GFX9-NEXT: v_readlane_b32 s28, v0, 0 ; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1 @@ -157,8 +157,8 @@ define amdgpu_gfx void @void_func_void_clobber_s28_s29() #1 { ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; clobber ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_readlane_b32 s31, v0, 3 ; GFX10-NEXT: v_readlane_b32 s30, v0, 2 +; GFX10-NEXT: v_readlane_b32 s31, v0, 3 ; GFX10-NEXT: v_readlane_b32 s29, v0, 1 ; GFX10-NEXT: v_readlane_b32 s28, v0, 0 ; GFX10-NEXT: s_xor_saveexec_b32 s34, -1 @@ -185,8 +185,8 @@ define amdgpu_gfx void @void_func_void_clobber_s28_s29() #1 { ; GFX11-NEXT: ; clobber ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v0, 3 ; GFX11-NEXT: v_readlane_b32 s30, v0, 2 +; GFX11-NEXT: v_readlane_b32 s31, v0, 3 ; GFX11-NEXT: v_readlane_b32 s29, v0, 1 ; GFX11-NEXT: v_readlane_b32 s28, v0, 0 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 @@ -224,8 +224,8 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(ptr addrspace(1) ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s31 ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 +; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 3 @@ -261,8 +261,8 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(ptr addrspace(1) ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use s31 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 +; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 3 @@ -298,8 +298,8 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(ptr addrspace(1) ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s31 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 +; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 3 @@ -341,8 +341,8 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1) ; GFX9-NEXT: ; use v31 ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v41, 1 ; GFX9-NEXT: v_readlane_b32 s30, v41, 0 +; GFX9-NEXT: v_readlane_b32 s31, v41, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v41, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -378,8 +378,8 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1) ; GFX10-NEXT: ; use v31 ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX10-NEXT: v_readlane_b32 s31, v41, 1 ; GFX10-NEXT: v_readlane_b32 s30, v41, 0 +; GFX10-NEXT: v_readlane_b32 s31, v41, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v41, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -416,8 +416,8 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1) ; GFX11-NEXT: ; use v31 ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload -; GFX11-NEXT: v_readlane_b32 s31, v41, 1 ; GFX11-NEXT: v_readlane_b32 s30, v41, 0 +; GFX11-NEXT: v_readlane_b32 s31, v41, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v41, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -455,11 +455,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(ptr addrspace(1) ; GFX9-NEXT: s_mov_b32 s4, s33 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: s_mov_b32 s33, s4 +; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s33 ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: v_readlane_b32 s31, v40, 2 -; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 3 @@ -492,11 +492,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(ptr addrspace(1) ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: s_mov_b32 s33, s4 +; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use s33 ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 -; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 3 @@ -529,12 +529,12 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(ptr addrspace(1) ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_mov_b32 s33, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s33 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 -; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 3 @@ -572,11 +572,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(ptr addrspace(1) ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: s_mov_b32 s34, s4 +; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s34 ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: v_readlane_b32 s31, v40, 2 -; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 3 @@ -609,11 +609,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(ptr addrspace(1) ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: s_mov_b32 s34, s4 +; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use s34 ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 -; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 3 @@ -645,13 +645,13 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(ptr addrspace(1) ; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: s_mov_b32 s34, s4 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s34 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 -; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 3 @@ -691,8 +691,8 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(ptr addrspace(1) ; GFX9-NEXT: ; use v40 ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v41, 1 ; GFX9-NEXT: v_readlane_b32 s30, v41, 0 +; GFX9-NEXT: v_readlane_b32 s31, v41, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v41, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -726,8 +726,8 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(ptr addrspace(1) ; GFX10-NEXT: ; use v40 ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX10-NEXT: v_readlane_b32 s31, v41, 1 ; GFX10-NEXT: v_readlane_b32 s30, v41, 0 +; GFX10-NEXT: v_readlane_b32 s31, v41, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v41, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -761,8 +761,8 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(ptr addrspace(1) ; GFX11-NEXT: ; use v40 ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload -; GFX11-NEXT: v_readlane_b32 s31, v41, 1 ; GFX11-NEXT: v_readlane_b32 s30, v41, 0 +; GFX11-NEXT: v_readlane_b32 s31, v41, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v41, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -849,8 +849,8 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -876,8 +876,8 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -904,8 +904,8 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -934,8 +934,8 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 { ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 @@ -961,8 +961,8 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 @@ -989,8 +989,8 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 { ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -1024,11 +1024,11 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 { ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_mov_b32 s4, s40 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s4 ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: v_readlane_b32 s31, v40, 2 -; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 3 @@ -1060,11 +1060,11 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 { ; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use s4 ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 -; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 3 @@ -1096,12 +1096,12 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 { ; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s4 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 -; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 3 @@ -1150,8 +1150,8 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 { ; GFX9-NEXT: ; use v40 ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v41, 2 ; GFX9-NEXT: v_readlane_b32 s30, v41, 1 +; GFX9-NEXT: v_readlane_b32 s31, v41, 2 ; GFX9-NEXT: v_readlane_b32 s4, v41, 0 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v41, 3 @@ -1195,8 +1195,8 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 { ; GFX10-NEXT: ; use v40 ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX10-NEXT: v_readlane_b32 s31, v41, 2 ; GFX10-NEXT: v_readlane_b32 s30, v41, 1 +; GFX10-NEXT: v_readlane_b32 s31, v41, 2 ; GFX10-NEXT: v_readlane_b32 s4, v41, 0 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v41, 3 @@ -1240,8 +1240,8 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 { ; GFX11-NEXT: ; use v40 ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload -; GFX11-NEXT: v_readlane_b32 s31, v41, 2 ; GFX11-NEXT: v_readlane_b32 s30, v41, 1 +; GFX11-NEXT: v_readlane_b32 s31, v41, 2 ; GFX11-NEXT: v_readlane_b32 s4, v41, 0 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v41, 3 diff --git a/llvm/test/CodeGen/AMDGPU/global-alias.ll b/llvm/test/CodeGen/AMDGPU/global-alias.ll index d8df20eb69452..4c7bef4aec091 100644 --- a/llvm/test/CodeGen/AMDGPU/global-alias.ll +++ b/llvm/test/CodeGen/AMDGPU/global-alias.ll @@ -35,8 +35,8 @@ define void @bar() { ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: v_readlane_b32 s31, v40, 1 ; CHECK-NEXT: v_readlane_b32 s30, v40, 0 +; CHECK-NEXT: v_readlane_b32 s31, v40, 1 ; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: v_readlane_b32 s4, v40, 2 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll index 76f204dd0c16a..ae13753004c3d 100644 --- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll +++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll @@ -9,28 +9,30 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill ; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[4:5] -; CHECK-NEXT: v_writelane_b32 v6, s30, 0 -; CHECK-NEXT: v_writelane_b32 v6, s31, 1 -; CHECK-NEXT: v_writelane_b32 v6, s36, 2 -; CHECK-NEXT: v_writelane_b32 v6, s37, 3 -; CHECK-NEXT: v_writelane_b32 v6, s38, 4 -; CHECK-NEXT: v_writelane_b32 v6, s39, 5 -; CHECK-NEXT: v_writelane_b32 v6, s48, 6 -; CHECK-NEXT: v_writelane_b32 v6, s49, 7 -; CHECK-NEXT: v_writelane_b32 v6, s50, 8 -; CHECK-NEXT: v_writelane_b32 v6, s51, 9 -; CHECK-NEXT: v_writelane_b32 v6, s52, 10 -; CHECK-NEXT: v_writelane_b32 v6, s53, 11 -; CHECK-NEXT: v_writelane_b32 v6, s54, 12 -; CHECK-NEXT: v_writelane_b32 v6, s55, 13 -; CHECK-NEXT: v_writelane_b32 v6, s64, 14 -; CHECK-NEXT: v_writelane_b32 v6, s65, 15 -; CHECK-NEXT: v_writelane_b32 v6, s66, 16 -; CHECK-NEXT: v_writelane_b32 v6, s67, 17 -; CHECK-NEXT: v_writelane_b32 v6, s68, 18 +; CHECK-NEXT: v_writelane_b32 v6, s36, 0 +; CHECK-NEXT: v_writelane_b32 v6, s37, 1 +; CHECK-NEXT: v_writelane_b32 v6, s38, 2 +; CHECK-NEXT: v_writelane_b32 v6, s39, 3 +; CHECK-NEXT: v_writelane_b32 v6, s48, 4 +; CHECK-NEXT: v_writelane_b32 v6, s49, 5 +; CHECK-NEXT: v_writelane_b32 v6, s50, 6 +; CHECK-NEXT: v_writelane_b32 v6, s51, 7 +; CHECK-NEXT: v_writelane_b32 v6, s52, 8 +; CHECK-NEXT: v_writelane_b32 v6, s53, 9 +; CHECK-NEXT: v_writelane_b32 v6, s54, 10 +; CHECK-NEXT: v_writelane_b32 v6, s55, 11 +; CHECK-NEXT: v_writelane_b32 v6, s64, 12 +; CHECK-NEXT: v_writelane_b32 v6, s65, 13 +; CHECK-NEXT: v_writelane_b32 v6, s66, 14 +; CHECK-NEXT: v_writelane_b32 v6, s67, 15 +; CHECK-NEXT: v_writelane_b32 v6, s68, 16 +; CHECK-NEXT: v_writelane_b32 v6, s69, 17 +; CHECK-NEXT: v_writelane_b32 v6, s70, 18 +; CHECK-NEXT: v_writelane_b32 v6, s71, 19 +; CHECK-NEXT: v_writelane_b32 v6, s30, 20 ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_mov_b64 s[8:9], 0 -; CHECK-NEXT: v_writelane_b32 v6, s69, 19 +; CHECK-NEXT: v_writelane_b32 v6, s31, 21 ; CHECK-NEXT: s_mov_b32 s68, 0 ; CHECK-NEXT: s_mov_b32 s69, s4 ; CHECK-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 @@ -40,11 +42,11 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: s_load_dwordx16 s[8:23], s[68:69], 0x130 ; CHECK-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane -; CHECK-NEXT: v_writelane_b32 v6, s70, 20 -; CHECK-NEXT: v_writelane_b32 v6, s71, 21 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v1, s4 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: s_load_dwordx16 s[36:51], s[68:69], 0x2f0 +; CHECK-NEXT: s_mov_b32 s70, s68 ; CHECK-NEXT: v_writelane_b32 v7, s8, 0 ; CHECK-NEXT: v_writelane_b32 v7, s9, 1 ; CHECK-NEXT: v_writelane_b32 v7, s10, 2 @@ -77,9 +79,7 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: v_writelane_b32 v7, s65, 29 ; CHECK-NEXT: v_writelane_b32 v7, s66, 30 ; CHECK-NEXT: s_load_dwordx16 s[8:23], s[68:69], 0x1f0 -; CHECK-NEXT: s_load_dwordx16 s[36:51], s[68:69], 0x2f0 ; CHECK-NEXT: s_mov_b32 s69, s68 -; CHECK-NEXT: s_mov_b32 s70, s68 ; CHECK-NEXT: s_mov_b32 s71, s68 ; CHECK-NEXT: v_writelane_b32 v7, s67, 31 ; CHECK-NEXT: image_sample_lz v1, v[1:2], s[60:67], s[68:71] dmask:0x1 @@ -225,29 +225,29 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: .LBB0_10: ; %UnifiedReturnBlock ; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] -; CHECK-NEXT: v_readlane_b32 s71, v6, 21 -; CHECK-NEXT: v_readlane_b32 s70, v6, 20 -; CHECK-NEXT: v_readlane_b32 s69, v6, 19 -; CHECK-NEXT: v_readlane_b32 s68, v6, 18 -; CHECK-NEXT: v_readlane_b32 s67, v6, 17 -; CHECK-NEXT: v_readlane_b32 s66, v6, 16 -; CHECK-NEXT: v_readlane_b32 s65, v6, 15 -; CHECK-NEXT: v_readlane_b32 s64, v6, 14 -; CHECK-NEXT: v_readlane_b32 s55, v6, 13 -; CHECK-NEXT: v_readlane_b32 s54, v6, 12 -; CHECK-NEXT: v_readlane_b32 s53, v6, 11 -; CHECK-NEXT: v_readlane_b32 s52, v6, 10 +; CHECK-NEXT: v_readlane_b32 s30, v6, 20 +; CHECK-NEXT: v_readlane_b32 s31, v6, 21 +; CHECK-NEXT: v_readlane_b32 s71, v6, 19 +; CHECK-NEXT: v_readlane_b32 s70, v6, 18 +; CHECK-NEXT: v_readlane_b32 s69, v6, 17 +; CHECK-NEXT: v_readlane_b32 s68, v6, 16 +; CHECK-NEXT: v_readlane_b32 s67, v6, 15 +; CHECK-NEXT: v_readlane_b32 s66, v6, 14 +; CHECK-NEXT: v_readlane_b32 s65, v6, 13 +; CHECK-NEXT: v_readlane_b32 s64, v6, 12 +; CHECK-NEXT: v_readlane_b32 s55, v6, 11 +; CHECK-NEXT: v_readlane_b32 s54, v6, 10 +; CHECK-NEXT: v_readlane_b32 s53, v6, 9 +; CHECK-NEXT: v_readlane_b32 s52, v6, 8 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_readlane_b32 s51, v6, 9 -; CHECK-NEXT: v_readlane_b32 s50, v6, 8 -; CHECK-NEXT: v_readlane_b32 s49, v6, 7 -; CHECK-NEXT: v_readlane_b32 s48, v6, 6 -; CHECK-NEXT: v_readlane_b32 s39, v6, 5 -; CHECK-NEXT: v_readlane_b32 s38, v6, 4 -; CHECK-NEXT: v_readlane_b32 s37, v6, 3 -; CHECK-NEXT: v_readlane_b32 s36, v6, 2 -; CHECK-NEXT: v_readlane_b32 s31, v6, 1 -; CHECK-NEXT: v_readlane_b32 s30, v6, 0 +; CHECK-NEXT: v_readlane_b32 s51, v6, 7 +; CHECK-NEXT: v_readlane_b32 s50, v6, 6 +; CHECK-NEXT: v_readlane_b32 s49, v6, 5 +; CHECK-NEXT: v_readlane_b32 s48, v6, 4 +; CHECK-NEXT: v_readlane_b32 s39, v6, 3 +; CHECK-NEXT: v_readlane_b32 s38, v6, 2 +; CHECK-NEXT: v_readlane_b32 s37, v6, 1 +; CHECK-NEXT: v_readlane_b32 s36, v6, 0 ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll index a208cfdb197af..2aaaff1ecc407 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll @@ -128,24 +128,24 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) { ; GCN-NEXT: s_mov_b64 exec, s[18:19] ; GCN-NEXT: v_writelane_b32 v40, s16, 18 ; GCN-NEXT: s_addk_i32 s32, 0x400 -; GCN-NEXT: v_writelane_b32 v40, s30, 0 -; GCN-NEXT: v_writelane_b32 v40, s31, 1 -; GCN-NEXT: v_writelane_b32 v40, s34, 2 -; GCN-NEXT: v_writelane_b32 v40, s35, 3 -; GCN-NEXT: v_writelane_b32 v40, s36, 4 -; GCN-NEXT: v_writelane_b32 v40, s37, 5 -; GCN-NEXT: v_writelane_b32 v40, s38, 6 -; GCN-NEXT: v_writelane_b32 v40, s39, 7 -; GCN-NEXT: v_writelane_b32 v40, s48, 8 -; GCN-NEXT: v_writelane_b32 v40, s49, 9 -; GCN-NEXT: v_writelane_b32 v40, s50, 10 -; GCN-NEXT: v_writelane_b32 v40, s51, 11 -; GCN-NEXT: v_writelane_b32 v40, s52, 12 -; GCN-NEXT: v_writelane_b32 v40, s53, 13 -; GCN-NEXT: v_writelane_b32 v40, s54, 14 -; GCN-NEXT: v_writelane_b32 v40, s55, 15 -; GCN-NEXT: v_writelane_b32 v40, s64, 16 -; GCN-NEXT: v_writelane_b32 v40, s65, 17 +; GCN-NEXT: v_writelane_b32 v40, s34, 0 +; GCN-NEXT: v_writelane_b32 v40, s35, 1 +; GCN-NEXT: v_writelane_b32 v40, s36, 2 +; GCN-NEXT: v_writelane_b32 v40, s37, 3 +; GCN-NEXT: v_writelane_b32 v40, s38, 4 +; GCN-NEXT: v_writelane_b32 v40, s39, 5 +; GCN-NEXT: v_writelane_b32 v40, s48, 6 +; GCN-NEXT: v_writelane_b32 v40, s49, 7 +; GCN-NEXT: v_writelane_b32 v40, s50, 8 +; GCN-NEXT: v_writelane_b32 v40, s51, 9 +; GCN-NEXT: v_writelane_b32 v40, s52, 10 +; GCN-NEXT: v_writelane_b32 v40, s53, 11 +; GCN-NEXT: v_writelane_b32 v40, s54, 12 +; GCN-NEXT: v_writelane_b32 v40, s55, 13 +; GCN-NEXT: v_writelane_b32 v40, s64, 14 +; GCN-NEXT: v_writelane_b32 v40, s65, 15 +; GCN-NEXT: v_writelane_b32 v40, s30, 16 +; GCN-NEXT: v_writelane_b32 v40, s31, 17 ; GCN-NEXT: s_mov_b32 s50, s15 ; GCN-NEXT: s_mov_b32 s51, s14 ; GCN-NEXT: s_mov_b32 s52, s13 @@ -175,24 +175,24 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) { ; GCN-NEXT: s_cbranch_execnz .LBB2_1 ; GCN-NEXT: ; %bb.2: ; GCN-NEXT: s_mov_b64 exec, s[54:55] -; GCN-NEXT: v_readlane_b32 s65, v40, 17 -; GCN-NEXT: v_readlane_b32 s64, v40, 16 -; GCN-NEXT: v_readlane_b32 s55, v40, 15 -; GCN-NEXT: v_readlane_b32 s54, v40, 14 -; GCN-NEXT: v_readlane_b32 s53, v40, 13 -; GCN-NEXT: v_readlane_b32 s52, v40, 12 -; GCN-NEXT: v_readlane_b32 s51, v40, 11 -; GCN-NEXT: v_readlane_b32 s50, v40, 10 -; GCN-NEXT: v_readlane_b32 s49, v40, 9 -; GCN-NEXT: v_readlane_b32 s48, v40, 8 -; GCN-NEXT: v_readlane_b32 s39, v40, 7 -; GCN-NEXT: v_readlane_b32 s38, v40, 6 -; GCN-NEXT: v_readlane_b32 s37, v40, 5 -; GCN-NEXT: v_readlane_b32 s36, v40, 4 -; GCN-NEXT: v_readlane_b32 s35, v40, 3 -; GCN-NEXT: v_readlane_b32 s34, v40, 2 -; GCN-NEXT: v_readlane_b32 s31, v40, 1 -; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: v_readlane_b32 s30, v40, 16 +; GCN-NEXT: v_readlane_b32 s31, v40, 17 +; GCN-NEXT: v_readlane_b32 s65, v40, 15 +; GCN-NEXT: v_readlane_b32 s64, v40, 14 +; GCN-NEXT: v_readlane_b32 s55, v40, 13 +; GCN-NEXT: v_readlane_b32 s54, v40, 12 +; GCN-NEXT: v_readlane_b32 s53, v40, 11 +; GCN-NEXT: v_readlane_b32 s52, v40, 10 +; GCN-NEXT: v_readlane_b32 s51, v40, 9 +; GCN-NEXT: v_readlane_b32 s50, v40, 8 +; GCN-NEXT: v_readlane_b32 s49, v40, 7 +; GCN-NEXT: v_readlane_b32 s48, v40, 6 +; GCN-NEXT: v_readlane_b32 s39, v40, 5 +; GCN-NEXT: v_readlane_b32 s38, v40, 4 +; GCN-NEXT: v_readlane_b32 s37, v40, 3 +; GCN-NEXT: v_readlane_b32 s36, v40, 2 +; GCN-NEXT: v_readlane_b32 s35, v40, 1 +; GCN-NEXT: v_readlane_b32 s34, v40, 0 ; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s4, v40, 18 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -212,24 +212,24 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) { ; GISEL-NEXT: s_mov_b64 exec, s[18:19] ; GISEL-NEXT: v_writelane_b32 v40, s16, 18 ; GISEL-NEXT: s_addk_i32 s32, 0x400 -; GISEL-NEXT: v_writelane_b32 v40, s30, 0 -; GISEL-NEXT: v_writelane_b32 v40, s31, 1 -; GISEL-NEXT: v_writelane_b32 v40, s34, 2 -; GISEL-NEXT: v_writelane_b32 v40, s35, 3 -; GISEL-NEXT: v_writelane_b32 v40, s36, 4 -; GISEL-NEXT: v_writelane_b32 v40, s37, 5 -; GISEL-NEXT: v_writelane_b32 v40, s38, 6 -; GISEL-NEXT: v_writelane_b32 v40, s39, 7 -; GISEL-NEXT: v_writelane_b32 v40, s48, 8 -; GISEL-NEXT: v_writelane_b32 v40, s49, 9 -; GISEL-NEXT: v_writelane_b32 v40, s50, 10 -; GISEL-NEXT: v_writelane_b32 v40, s51, 11 -; GISEL-NEXT: v_writelane_b32 v40, s52, 12 -; GISEL-NEXT: v_writelane_b32 v40, s53, 13 -; GISEL-NEXT: v_writelane_b32 v40, s54, 14 -; GISEL-NEXT: v_writelane_b32 v40, s55, 15 -; GISEL-NEXT: v_writelane_b32 v40, s64, 16 -; GISEL-NEXT: v_writelane_b32 v40, s65, 17 +; GISEL-NEXT: v_writelane_b32 v40, s34, 0 +; GISEL-NEXT: v_writelane_b32 v40, s35, 1 +; GISEL-NEXT: v_writelane_b32 v40, s36, 2 +; GISEL-NEXT: v_writelane_b32 v40, s37, 3 +; GISEL-NEXT: v_writelane_b32 v40, s38, 4 +; GISEL-NEXT: v_writelane_b32 v40, s39, 5 +; GISEL-NEXT: v_writelane_b32 v40, s48, 6 +; GISEL-NEXT: v_writelane_b32 v40, s49, 7 +; GISEL-NEXT: v_writelane_b32 v40, s50, 8 +; GISEL-NEXT: v_writelane_b32 v40, s51, 9 +; GISEL-NEXT: v_writelane_b32 v40, s52, 10 +; GISEL-NEXT: v_writelane_b32 v40, s53, 11 +; GISEL-NEXT: v_writelane_b32 v40, s54, 12 +; GISEL-NEXT: v_writelane_b32 v40, s55, 13 +; GISEL-NEXT: v_writelane_b32 v40, s64, 14 +; GISEL-NEXT: v_writelane_b32 v40, s65, 15 +; GISEL-NEXT: v_writelane_b32 v40, s30, 16 +; GISEL-NEXT: v_writelane_b32 v40, s31, 17 ; GISEL-NEXT: s_mov_b32 s50, s15 ; GISEL-NEXT: s_mov_b32 s51, s14 ; GISEL-NEXT: s_mov_b32 s52, s13 @@ -259,24 +259,24 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) { ; GISEL-NEXT: s_cbranch_execnz .LBB2_1 ; GISEL-NEXT: ; %bb.2: ; GISEL-NEXT: s_mov_b64 exec, s[54:55] -; GISEL-NEXT: v_readlane_b32 s65, v40, 17 -; GISEL-NEXT: v_readlane_b32 s64, v40, 16 -; GISEL-NEXT: v_readlane_b32 s55, v40, 15 -; GISEL-NEXT: v_readlane_b32 s54, v40, 14 -; GISEL-NEXT: v_readlane_b32 s53, v40, 13 -; GISEL-NEXT: v_readlane_b32 s52, v40, 12 -; GISEL-NEXT: v_readlane_b32 s51, v40, 11 -; GISEL-NEXT: v_readlane_b32 s50, v40, 10 -; GISEL-NEXT: v_readlane_b32 s49, v40, 9 -; GISEL-NEXT: v_readlane_b32 s48, v40, 8 -; GISEL-NEXT: v_readlane_b32 s39, v40, 7 -; GISEL-NEXT: v_readlane_b32 s38, v40, 6 -; GISEL-NEXT: v_readlane_b32 s37, v40, 5 -; GISEL-NEXT: v_readlane_b32 s36, v40, 4 -; GISEL-NEXT: v_readlane_b32 s35, v40, 3 -; GISEL-NEXT: v_readlane_b32 s34, v40, 2 -; GISEL-NEXT: v_readlane_b32 s31, v40, 1 -; GISEL-NEXT: v_readlane_b32 s30, v40, 0 +; GISEL-NEXT: v_readlane_b32 s30, v40, 16 +; GISEL-NEXT: v_readlane_b32 s31, v40, 17 +; GISEL-NEXT: v_readlane_b32 s65, v40, 15 +; GISEL-NEXT: v_readlane_b32 s64, v40, 14 +; GISEL-NEXT: v_readlane_b32 s55, v40, 13 +; GISEL-NEXT: v_readlane_b32 s54, v40, 12 +; GISEL-NEXT: v_readlane_b32 s53, v40, 11 +; GISEL-NEXT: v_readlane_b32 s52, v40, 10 +; GISEL-NEXT: v_readlane_b32 s51, v40, 9 +; GISEL-NEXT: v_readlane_b32 s50, v40, 8 +; GISEL-NEXT: v_readlane_b32 s49, v40, 7 +; GISEL-NEXT: v_readlane_b32 s48, v40, 6 +; GISEL-NEXT: v_readlane_b32 s39, v40, 5 +; GISEL-NEXT: v_readlane_b32 s38, v40, 4 +; GISEL-NEXT: v_readlane_b32 s37, v40, 3 +; GISEL-NEXT: v_readlane_b32 s36, v40, 2 +; GISEL-NEXT: v_readlane_b32 s35, v40, 1 +; GISEL-NEXT: v_readlane_b32 s34, v40, 0 ; GISEL-NEXT: s_mov_b32 s32, s33 ; GISEL-NEXT: v_readlane_b32 s4, v40, 18 ; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -300,24 +300,24 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) { ; GCN-NEXT: s_mov_b64 exec, s[18:19] ; GCN-NEXT: v_writelane_b32 v40, s16, 18 ; GCN-NEXT: s_addk_i32 s32, 0x400 -; GCN-NEXT: v_writelane_b32 v40, s30, 0 -; GCN-NEXT: v_writelane_b32 v40, s31, 1 -; GCN-NEXT: v_writelane_b32 v40, s34, 2 -; GCN-NEXT: v_writelane_b32 v40, s35, 3 -; GCN-NEXT: v_writelane_b32 v40, s36, 4 -; GCN-NEXT: v_writelane_b32 v40, s37, 5 -; GCN-NEXT: v_writelane_b32 v40, s38, 6 -; GCN-NEXT: v_writelane_b32 v40, s39, 7 -; GCN-NEXT: v_writelane_b32 v40, s48, 8 -; GCN-NEXT: v_writelane_b32 v40, s49, 9 -; GCN-NEXT: v_writelane_b32 v40, s50, 10 -; GCN-NEXT: v_writelane_b32 v40, s51, 11 -; GCN-NEXT: v_writelane_b32 v40, s52, 12 -; GCN-NEXT: v_writelane_b32 v40, s53, 13 -; GCN-NEXT: v_writelane_b32 v40, s54, 14 -; GCN-NEXT: v_writelane_b32 v40, s55, 15 -; GCN-NEXT: v_writelane_b32 v40, s64, 16 -; GCN-NEXT: v_writelane_b32 v40, s65, 17 +; GCN-NEXT: v_writelane_b32 v40, s34, 0 +; GCN-NEXT: v_writelane_b32 v40, s35, 1 +; GCN-NEXT: v_writelane_b32 v40, s36, 2 +; GCN-NEXT: v_writelane_b32 v40, s37, 3 +; GCN-NEXT: v_writelane_b32 v40, s38, 4 +; GCN-NEXT: v_writelane_b32 v40, s39, 5 +; GCN-NEXT: v_writelane_b32 v40, s48, 6 +; GCN-NEXT: v_writelane_b32 v40, s49, 7 +; GCN-NEXT: v_writelane_b32 v40, s50, 8 +; GCN-NEXT: v_writelane_b32 v40, s51, 9 +; GCN-NEXT: v_writelane_b32 v40, s52, 10 +; GCN-NEXT: v_writelane_b32 v40, s53, 11 +; GCN-NEXT: v_writelane_b32 v40, s54, 12 +; GCN-NEXT: v_writelane_b32 v40, s55, 13 +; GCN-NEXT: v_writelane_b32 v40, s64, 14 +; GCN-NEXT: v_writelane_b32 v40, s65, 15 +; GCN-NEXT: v_writelane_b32 v40, s30, 16 +; GCN-NEXT: v_writelane_b32 v40, s31, 17 ; GCN-NEXT: s_mov_b32 s50, s15 ; GCN-NEXT: s_mov_b32 s51, s14 ; GCN-NEXT: s_mov_b32 s52, s13 @@ -350,24 +350,24 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) { ; GCN-NEXT: s_cbranch_execnz .LBB3_1 ; GCN-NEXT: ; %bb.2: ; GCN-NEXT: s_mov_b64 exec, s[54:55] -; GCN-NEXT: v_readlane_b32 s65, v40, 17 -; GCN-NEXT: v_readlane_b32 s64, v40, 16 -; GCN-NEXT: v_readlane_b32 s55, v40, 15 -; GCN-NEXT: v_readlane_b32 s54, v40, 14 -; GCN-NEXT: v_readlane_b32 s53, v40, 13 -; GCN-NEXT: v_readlane_b32 s52, v40, 12 -; GCN-NEXT: v_readlane_b32 s51, v40, 11 -; GCN-NEXT: v_readlane_b32 s50, v40, 10 -; GCN-NEXT: v_readlane_b32 s49, v40, 9 -; GCN-NEXT: v_readlane_b32 s48, v40, 8 -; GCN-NEXT: v_readlane_b32 s39, v40, 7 -; GCN-NEXT: v_readlane_b32 s38, v40, 6 -; GCN-NEXT: v_readlane_b32 s37, v40, 5 -; GCN-NEXT: v_readlane_b32 s36, v40, 4 -; GCN-NEXT: v_readlane_b32 s35, v40, 3 -; GCN-NEXT: v_readlane_b32 s34, v40, 2 -; GCN-NEXT: v_readlane_b32 s31, v40, 1 -; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: v_readlane_b32 s30, v40, 16 +; GCN-NEXT: v_readlane_b32 s31, v40, 17 +; GCN-NEXT: v_readlane_b32 s65, v40, 15 +; GCN-NEXT: v_readlane_b32 s64, v40, 14 +; GCN-NEXT: v_readlane_b32 s55, v40, 13 +; GCN-NEXT: v_readlane_b32 s54, v40, 12 +; GCN-NEXT: v_readlane_b32 s53, v40, 11 +; GCN-NEXT: v_readlane_b32 s52, v40, 10 +; GCN-NEXT: v_readlane_b32 s51, v40, 9 +; GCN-NEXT: v_readlane_b32 s50, v40, 8 +; GCN-NEXT: v_readlane_b32 s49, v40, 7 +; GCN-NEXT: v_readlane_b32 s48, v40, 6 +; GCN-NEXT: v_readlane_b32 s39, v40, 5 +; GCN-NEXT: v_readlane_b32 s38, v40, 4 +; GCN-NEXT: v_readlane_b32 s37, v40, 3 +; GCN-NEXT: v_readlane_b32 s36, v40, 2 +; GCN-NEXT: v_readlane_b32 s35, v40, 1 +; GCN-NEXT: v_readlane_b32 s34, v40, 0 ; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s4, v40, 18 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -387,24 +387,24 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) { ; GISEL-NEXT: s_mov_b64 exec, s[18:19] ; GISEL-NEXT: v_writelane_b32 v40, s16, 18 ; GISEL-NEXT: s_addk_i32 s32, 0x400 -; GISEL-NEXT: v_writelane_b32 v40, s30, 0 -; GISEL-NEXT: v_writelane_b32 v40, s31, 1 -; GISEL-NEXT: v_writelane_b32 v40, s34, 2 -; GISEL-NEXT: v_writelane_b32 v40, s35, 3 -; GISEL-NEXT: v_writelane_b32 v40, s36, 4 -; GISEL-NEXT: v_writelane_b32 v40, s37, 5 -; GISEL-NEXT: v_writelane_b32 v40, s38, 6 -; GISEL-NEXT: v_writelane_b32 v40, s39, 7 -; GISEL-NEXT: v_writelane_b32 v40, s48, 8 -; GISEL-NEXT: v_writelane_b32 v40, s49, 9 -; GISEL-NEXT: v_writelane_b32 v40, s50, 10 -; GISEL-NEXT: v_writelane_b32 v40, s51, 11 -; GISEL-NEXT: v_writelane_b32 v40, s52, 12 -; GISEL-NEXT: v_writelane_b32 v40, s53, 13 -; GISEL-NEXT: v_writelane_b32 v40, s54, 14 -; GISEL-NEXT: v_writelane_b32 v40, s55, 15 -; GISEL-NEXT: v_writelane_b32 v40, s64, 16 -; GISEL-NEXT: v_writelane_b32 v40, s65, 17 +; GISEL-NEXT: v_writelane_b32 v40, s34, 0 +; GISEL-NEXT: v_writelane_b32 v40, s35, 1 +; GISEL-NEXT: v_writelane_b32 v40, s36, 2 +; GISEL-NEXT: v_writelane_b32 v40, s37, 3 +; GISEL-NEXT: v_writelane_b32 v40, s38, 4 +; GISEL-NEXT: v_writelane_b32 v40, s39, 5 +; GISEL-NEXT: v_writelane_b32 v40, s48, 6 +; GISEL-NEXT: v_writelane_b32 v40, s49, 7 +; GISEL-NEXT: v_writelane_b32 v40, s50, 8 +; GISEL-NEXT: v_writelane_b32 v40, s51, 9 +; GISEL-NEXT: v_writelane_b32 v40, s52, 10 +; GISEL-NEXT: v_writelane_b32 v40, s53, 11 +; GISEL-NEXT: v_writelane_b32 v40, s54, 12 +; GISEL-NEXT: v_writelane_b32 v40, s55, 13 +; GISEL-NEXT: v_writelane_b32 v40, s64, 14 +; GISEL-NEXT: v_writelane_b32 v40, s65, 15 +; GISEL-NEXT: v_writelane_b32 v40, s30, 16 +; GISEL-NEXT: v_writelane_b32 v40, s31, 17 ; GISEL-NEXT: s_mov_b32 s50, s15 ; GISEL-NEXT: s_mov_b32 s51, s14 ; GISEL-NEXT: s_mov_b32 s52, s13 @@ -435,24 +435,24 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) { ; GISEL-NEXT: s_cbranch_execnz .LBB3_1 ; GISEL-NEXT: ; %bb.2: ; GISEL-NEXT: s_mov_b64 exec, s[54:55] -; GISEL-NEXT: v_readlane_b32 s65, v40, 17 -; GISEL-NEXT: v_readlane_b32 s64, v40, 16 -; GISEL-NEXT: v_readlane_b32 s55, v40, 15 -; GISEL-NEXT: v_readlane_b32 s54, v40, 14 -; GISEL-NEXT: v_readlane_b32 s53, v40, 13 -; GISEL-NEXT: v_readlane_b32 s52, v40, 12 -; GISEL-NEXT: v_readlane_b32 s51, v40, 11 -; GISEL-NEXT: v_readlane_b32 s50, v40, 10 -; GISEL-NEXT: v_readlane_b32 s49, v40, 9 -; GISEL-NEXT: v_readlane_b32 s48, v40, 8 -; GISEL-NEXT: v_readlane_b32 s39, v40, 7 -; GISEL-NEXT: v_readlane_b32 s38, v40, 6 -; GISEL-NEXT: v_readlane_b32 s37, v40, 5 -; GISEL-NEXT: v_readlane_b32 s36, v40, 4 -; GISEL-NEXT: v_readlane_b32 s35, v40, 3 -; GISEL-NEXT: v_readlane_b32 s34, v40, 2 -; GISEL-NEXT: v_readlane_b32 s31, v40, 1 -; GISEL-NEXT: v_readlane_b32 s30, v40, 0 +; GISEL-NEXT: v_readlane_b32 s30, v40, 16 +; GISEL-NEXT: v_readlane_b32 s31, v40, 17 +; GISEL-NEXT: v_readlane_b32 s65, v40, 15 +; GISEL-NEXT: v_readlane_b32 s64, v40, 14 +; GISEL-NEXT: v_readlane_b32 s55, v40, 13 +; GISEL-NEXT: v_readlane_b32 s54, v40, 12 +; GISEL-NEXT: v_readlane_b32 s53, v40, 11 +; GISEL-NEXT: v_readlane_b32 s52, v40, 10 +; GISEL-NEXT: v_readlane_b32 s51, v40, 9 +; GISEL-NEXT: v_readlane_b32 s50, v40, 8 +; GISEL-NEXT: v_readlane_b32 s49, v40, 7 +; GISEL-NEXT: v_readlane_b32 s48, v40, 6 +; GISEL-NEXT: v_readlane_b32 s39, v40, 5 +; GISEL-NEXT: v_readlane_b32 s38, v40, 4 +; GISEL-NEXT: v_readlane_b32 s37, v40, 3 +; GISEL-NEXT: v_readlane_b32 s36, v40, 2 +; GISEL-NEXT: v_readlane_b32 s35, v40, 1 +; GISEL-NEXT: v_readlane_b32 s34, v40, 0 ; GISEL-NEXT: s_mov_b32 s32, s33 ; GISEL-NEXT: v_readlane_b32 s4, v40, 18 ; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -476,24 +476,24 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) { ; GCN-NEXT: s_mov_b64 exec, s[18:19] ; GCN-NEXT: v_writelane_b32 v40, s16, 18 ; GCN-NEXT: s_addk_i32 s32, 0x400 -; GCN-NEXT: v_writelane_b32 v40, s30, 0 -; GCN-NEXT: v_writelane_b32 v40, s31, 1 -; GCN-NEXT: v_writelane_b32 v40, s34, 2 -; GCN-NEXT: v_writelane_b32 v40, s35, 3 -; GCN-NEXT: v_writelane_b32 v40, s36, 4 -; GCN-NEXT: v_writelane_b32 v40, s37, 5 -; GCN-NEXT: v_writelane_b32 v40, s38, 6 -; GCN-NEXT: v_writelane_b32 v40, s39, 7 -; GCN-NEXT: v_writelane_b32 v40, s48, 8 -; GCN-NEXT: v_writelane_b32 v40, s49, 9 -; GCN-NEXT: v_writelane_b32 v40, s50, 10 -; GCN-NEXT: v_writelane_b32 v40, s51, 11 -; GCN-NEXT: v_writelane_b32 v40, s52, 12 -; GCN-NEXT: v_writelane_b32 v40, s53, 13 -; GCN-NEXT: v_writelane_b32 v40, s54, 14 -; GCN-NEXT: v_writelane_b32 v40, s55, 15 -; GCN-NEXT: v_writelane_b32 v40, s64, 16 -; GCN-NEXT: v_writelane_b32 v40, s65, 17 +; GCN-NEXT: v_writelane_b32 v40, s34, 0 +; GCN-NEXT: v_writelane_b32 v40, s35, 1 +; GCN-NEXT: v_writelane_b32 v40, s36, 2 +; GCN-NEXT: v_writelane_b32 v40, s37, 3 +; GCN-NEXT: v_writelane_b32 v40, s38, 4 +; GCN-NEXT: v_writelane_b32 v40, s39, 5 +; GCN-NEXT: v_writelane_b32 v40, s48, 6 +; GCN-NEXT: v_writelane_b32 v40, s49, 7 +; GCN-NEXT: v_writelane_b32 v40, s50, 8 +; GCN-NEXT: v_writelane_b32 v40, s51, 9 +; GCN-NEXT: v_writelane_b32 v40, s52, 10 +; GCN-NEXT: v_writelane_b32 v40, s53, 11 +; GCN-NEXT: v_writelane_b32 v40, s54, 12 +; GCN-NEXT: v_writelane_b32 v40, s55, 13 +; GCN-NEXT: v_writelane_b32 v40, s64, 14 +; GCN-NEXT: v_writelane_b32 v40, s65, 15 +; GCN-NEXT: v_writelane_b32 v40, s30, 16 +; GCN-NEXT: v_writelane_b32 v40, s31, 17 ; GCN-NEXT: s_mov_b32 s50, s15 ; GCN-NEXT: s_mov_b32 s51, s14 ; GCN-NEXT: s_mov_b32 s52, s13 @@ -525,24 +525,24 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) { ; GCN-NEXT: ; %bb.2: ; GCN-NEXT: s_mov_b64 exec, s[54:55] ; GCN-NEXT: v_add_i32_e32 v0, vcc, 1, v2 -; GCN-NEXT: v_readlane_b32 s65, v40, 17 -; GCN-NEXT: v_readlane_b32 s64, v40, 16 -; GCN-NEXT: v_readlane_b32 s55, v40, 15 -; GCN-NEXT: v_readlane_b32 s54, v40, 14 -; GCN-NEXT: v_readlane_b32 s53, v40, 13 -; GCN-NEXT: v_readlane_b32 s52, v40, 12 -; GCN-NEXT: v_readlane_b32 s51, v40, 11 -; GCN-NEXT: v_readlane_b32 s50, v40, 10 -; GCN-NEXT: v_readlane_b32 s49, v40, 9 -; GCN-NEXT: v_readlane_b32 s48, v40, 8 -; GCN-NEXT: v_readlane_b32 s39, v40, 7 -; GCN-NEXT: v_readlane_b32 s38, v40, 6 -; GCN-NEXT: v_readlane_b32 s37, v40, 5 -; GCN-NEXT: v_readlane_b32 s36, v40, 4 -; GCN-NEXT: v_readlane_b32 s35, v40, 3 -; GCN-NEXT: v_readlane_b32 s34, v40, 2 -; GCN-NEXT: v_readlane_b32 s31, v40, 1 -; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: v_readlane_b32 s30, v40, 16 +; GCN-NEXT: v_readlane_b32 s31, v40, 17 +; GCN-NEXT: v_readlane_b32 s65, v40, 15 +; GCN-NEXT: v_readlane_b32 s64, v40, 14 +; GCN-NEXT: v_readlane_b32 s55, v40, 13 +; GCN-NEXT: v_readlane_b32 s54, v40, 12 +; GCN-NEXT: v_readlane_b32 s53, v40, 11 +; GCN-NEXT: v_readlane_b32 s52, v40, 10 +; GCN-NEXT: v_readlane_b32 s51, v40, 9 +; GCN-NEXT: v_readlane_b32 s50, v40, 8 +; GCN-NEXT: v_readlane_b32 s49, v40, 7 +; GCN-NEXT: v_readlane_b32 s48, v40, 6 +; GCN-NEXT: v_readlane_b32 s39, v40, 5 +; GCN-NEXT: v_readlane_b32 s38, v40, 4 +; GCN-NEXT: v_readlane_b32 s37, v40, 3 +; GCN-NEXT: v_readlane_b32 s36, v40, 2 +; GCN-NEXT: v_readlane_b32 s35, v40, 1 +; GCN-NEXT: v_readlane_b32 s34, v40, 0 ; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s4, v40, 18 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -562,24 +562,24 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) { ; GISEL-NEXT: s_mov_b64 exec, s[18:19] ; GISEL-NEXT: v_writelane_b32 v40, s16, 18 ; GISEL-NEXT: s_addk_i32 s32, 0x400 -; GISEL-NEXT: v_writelane_b32 v40, s30, 0 -; GISEL-NEXT: v_writelane_b32 v40, s31, 1 -; GISEL-NEXT: v_writelane_b32 v40, s34, 2 -; GISEL-NEXT: v_writelane_b32 v40, s35, 3 -; GISEL-NEXT: v_writelane_b32 v40, s36, 4 -; GISEL-NEXT: v_writelane_b32 v40, s37, 5 -; GISEL-NEXT: v_writelane_b32 v40, s38, 6 -; GISEL-NEXT: v_writelane_b32 v40, s39, 7 -; GISEL-NEXT: v_writelane_b32 v40, s48, 8 -; GISEL-NEXT: v_writelane_b32 v40, s49, 9 -; GISEL-NEXT: v_writelane_b32 v40, s50, 10 -; GISEL-NEXT: v_writelane_b32 v40, s51, 11 -; GISEL-NEXT: v_writelane_b32 v40, s52, 12 -; GISEL-NEXT: v_writelane_b32 v40, s53, 13 -; GISEL-NEXT: v_writelane_b32 v40, s54, 14 -; GISEL-NEXT: v_writelane_b32 v40, s55, 15 -; GISEL-NEXT: v_writelane_b32 v40, s64, 16 -; GISEL-NEXT: v_writelane_b32 v40, s65, 17 +; GISEL-NEXT: v_writelane_b32 v40, s34, 0 +; GISEL-NEXT: v_writelane_b32 v40, s35, 1 +; GISEL-NEXT: v_writelane_b32 v40, s36, 2 +; GISEL-NEXT: v_writelane_b32 v40, s37, 3 +; GISEL-NEXT: v_writelane_b32 v40, s38, 4 +; GISEL-NEXT: v_writelane_b32 v40, s39, 5 +; GISEL-NEXT: v_writelane_b32 v40, s48, 6 +; GISEL-NEXT: v_writelane_b32 v40, s49, 7 +; GISEL-NEXT: v_writelane_b32 v40, s50, 8 +; GISEL-NEXT: v_writelane_b32 v40, s51, 9 +; GISEL-NEXT: v_writelane_b32 v40, s52, 10 +; GISEL-NEXT: v_writelane_b32 v40, s53, 11 +; GISEL-NEXT: v_writelane_b32 v40, s54, 12 +; GISEL-NEXT: v_writelane_b32 v40, s55, 13 +; GISEL-NEXT: v_writelane_b32 v40, s64, 14 +; GISEL-NEXT: v_writelane_b32 v40, s65, 15 +; GISEL-NEXT: v_writelane_b32 v40, s30, 16 +; GISEL-NEXT: v_writelane_b32 v40, s31, 17 ; GISEL-NEXT: s_mov_b32 s50, s15 ; GISEL-NEXT: s_mov_b32 s51, s14 ; GISEL-NEXT: s_mov_b32 s52, s13 @@ -611,24 +611,24 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) { ; GISEL-NEXT: ; %bb.2: ; GISEL-NEXT: s_mov_b64 exec, s[54:55] ; GISEL-NEXT: v_add_i32_e32 v0, vcc, 1, v1 -; GISEL-NEXT: v_readlane_b32 s65, v40, 17 -; GISEL-NEXT: v_readlane_b32 s64, v40, 16 -; GISEL-NEXT: v_readlane_b32 s55, v40, 15 -; GISEL-NEXT: v_readlane_b32 s54, v40, 14 -; GISEL-NEXT: v_readlane_b32 s53, v40, 13 -; GISEL-NEXT: v_readlane_b32 s52, v40, 12 -; GISEL-NEXT: v_readlane_b32 s51, v40, 11 -; GISEL-NEXT: v_readlane_b32 s50, v40, 10 -; GISEL-NEXT: v_readlane_b32 s49, v40, 9 -; GISEL-NEXT: v_readlane_b32 s48, v40, 8 -; GISEL-NEXT: v_readlane_b32 s39, v40, 7 -; GISEL-NEXT: v_readlane_b32 s38, v40, 6 -; GISEL-NEXT: v_readlane_b32 s37, v40, 5 -; GISEL-NEXT: v_readlane_b32 s36, v40, 4 -; GISEL-NEXT: v_readlane_b32 s35, v40, 3 -; GISEL-NEXT: v_readlane_b32 s34, v40, 2 -; GISEL-NEXT: v_readlane_b32 s31, v40, 1 -; GISEL-NEXT: v_readlane_b32 s30, v40, 0 +; GISEL-NEXT: v_readlane_b32 s30, v40, 16 +; GISEL-NEXT: v_readlane_b32 s31, v40, 17 +; GISEL-NEXT: v_readlane_b32 s65, v40, 15 +; GISEL-NEXT: v_readlane_b32 s64, v40, 14 +; GISEL-NEXT: v_readlane_b32 s55, v40, 13 +; GISEL-NEXT: v_readlane_b32 s54, v40, 12 +; GISEL-NEXT: v_readlane_b32 s53, v40, 11 +; GISEL-NEXT: v_readlane_b32 s52, v40, 10 +; GISEL-NEXT: v_readlane_b32 s51, v40, 9 +; GISEL-NEXT: v_readlane_b32 s50, v40, 8 +; GISEL-NEXT: v_readlane_b32 s49, v40, 7 +; GISEL-NEXT: v_readlane_b32 s48, v40, 6 +; GISEL-NEXT: v_readlane_b32 s39, v40, 5 +; GISEL-NEXT: v_readlane_b32 s38, v40, 4 +; GISEL-NEXT: v_readlane_b32 s37, v40, 3 +; GISEL-NEXT: v_readlane_b32 s36, v40, 2 +; GISEL-NEXT: v_readlane_b32 s35, v40, 1 +; GISEL-NEXT: v_readlane_b32 s34, v40, 0 ; GISEL-NEXT: s_mov_b32 s32, s33 ; GISEL-NEXT: v_readlane_b32 s4, v40, 18 ; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -653,26 +653,26 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) { ; GCN-NEXT: s_mov_b64 exec, s[18:19] ; GCN-NEXT: v_writelane_b32 v40, s16, 20 ; GCN-NEXT: s_addk_i32 s32, 0x400 -; GCN-NEXT: v_writelane_b32 v40, s30, 0 -; GCN-NEXT: v_writelane_b32 v40, s31, 1 -; GCN-NEXT: v_writelane_b32 v40, s34, 2 -; GCN-NEXT: v_writelane_b32 v40, s35, 3 -; GCN-NEXT: v_writelane_b32 v40, s36, 4 -; GCN-NEXT: v_writelane_b32 v40, s37, 5 -; GCN-NEXT: v_writelane_b32 v40, s38, 6 -; GCN-NEXT: v_writelane_b32 v40, s39, 7 -; GCN-NEXT: v_writelane_b32 v40, s48, 8 -; GCN-NEXT: v_writelane_b32 v40, s49, 9 -; GCN-NEXT: v_writelane_b32 v40, s50, 10 -; GCN-NEXT: v_writelane_b32 v40, s51, 11 -; GCN-NEXT: v_writelane_b32 v40, s52, 12 -; GCN-NEXT: v_writelane_b32 v40, s53, 13 -; GCN-NEXT: v_writelane_b32 v40, s54, 14 -; GCN-NEXT: v_writelane_b32 v40, s55, 15 -; GCN-NEXT: v_writelane_b32 v40, s64, 16 -; GCN-NEXT: v_writelane_b32 v40, s65, 17 -; GCN-NEXT: v_writelane_b32 v40, s66, 18 -; GCN-NEXT: v_writelane_b32 v40, s67, 19 +; GCN-NEXT: v_writelane_b32 v40, s34, 0 +; GCN-NEXT: v_writelane_b32 v40, s35, 1 +; GCN-NEXT: v_writelane_b32 v40, s36, 2 +; GCN-NEXT: v_writelane_b32 v40, s37, 3 +; GCN-NEXT: v_writelane_b32 v40, s38, 4 +; GCN-NEXT: v_writelane_b32 v40, s39, 5 +; GCN-NEXT: v_writelane_b32 v40, s48, 6 +; GCN-NEXT: v_writelane_b32 v40, s49, 7 +; GCN-NEXT: v_writelane_b32 v40, s50, 8 +; GCN-NEXT: v_writelane_b32 v40, s51, 9 +; GCN-NEXT: v_writelane_b32 v40, s52, 10 +; GCN-NEXT: v_writelane_b32 v40, s53, 11 +; GCN-NEXT: v_writelane_b32 v40, s54, 12 +; GCN-NEXT: v_writelane_b32 v40, s55, 13 +; GCN-NEXT: v_writelane_b32 v40, s64, 14 +; GCN-NEXT: v_writelane_b32 v40, s65, 15 +; GCN-NEXT: v_writelane_b32 v40, s66, 16 +; GCN-NEXT: v_writelane_b32 v40, s67, 17 +; GCN-NEXT: v_writelane_b32 v40, s30, 18 +; GCN-NEXT: v_writelane_b32 v40, s31, 19 ; GCN-NEXT: s_mov_b32 s50, s15 ; GCN-NEXT: s_mov_b32 s51, s14 ; GCN-NEXT: s_mov_b32 s52, s13 @@ -709,26 +709,26 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) { ; GCN-NEXT: s_mov_b64 exec, s[64:65] ; GCN-NEXT: .LBB5_4: ; %bb2 ; GCN-NEXT: s_or_b64 exec, exec, s[54:55] -; GCN-NEXT: v_readlane_b32 s67, v40, 19 -; GCN-NEXT: v_readlane_b32 s66, v40, 18 -; GCN-NEXT: v_readlane_b32 s65, v40, 17 -; GCN-NEXT: v_readlane_b32 s64, v40, 16 -; GCN-NEXT: v_readlane_b32 s55, v40, 15 -; GCN-NEXT: v_readlane_b32 s54, v40, 14 -; GCN-NEXT: v_readlane_b32 s53, v40, 13 -; GCN-NEXT: v_readlane_b32 s52, v40, 12 -; GCN-NEXT: v_readlane_b32 s51, v40, 11 -; GCN-NEXT: v_readlane_b32 s50, v40, 10 -; GCN-NEXT: v_readlane_b32 s49, v40, 9 -; GCN-NEXT: v_readlane_b32 s48, v40, 8 -; GCN-NEXT: v_readlane_b32 s39, v40, 7 -; GCN-NEXT: v_readlane_b32 s38, v40, 6 -; GCN-NEXT: v_readlane_b32 s37, v40, 5 -; GCN-NEXT: v_readlane_b32 s36, v40, 4 -; GCN-NEXT: v_readlane_b32 s35, v40, 3 -; GCN-NEXT: v_readlane_b32 s34, v40, 2 -; GCN-NEXT: v_readlane_b32 s31, v40, 1 -; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: v_readlane_b32 s30, v40, 18 +; GCN-NEXT: v_readlane_b32 s31, v40, 19 +; GCN-NEXT: v_readlane_b32 s67, v40, 17 +; GCN-NEXT: v_readlane_b32 s66, v40, 16 +; GCN-NEXT: v_readlane_b32 s65, v40, 15 +; GCN-NEXT: v_readlane_b32 s64, v40, 14 +; GCN-NEXT: v_readlane_b32 s55, v40, 13 +; GCN-NEXT: v_readlane_b32 s54, v40, 12 +; GCN-NEXT: v_readlane_b32 s53, v40, 11 +; GCN-NEXT: v_readlane_b32 s52, v40, 10 +; GCN-NEXT: v_readlane_b32 s51, v40, 9 +; GCN-NEXT: v_readlane_b32 s50, v40, 8 +; GCN-NEXT: v_readlane_b32 s49, v40, 7 +; GCN-NEXT: v_readlane_b32 s48, v40, 6 +; GCN-NEXT: v_readlane_b32 s39, v40, 5 +; GCN-NEXT: v_readlane_b32 s38, v40, 4 +; GCN-NEXT: v_readlane_b32 s37, v40, 3 +; GCN-NEXT: v_readlane_b32 s36, v40, 2 +; GCN-NEXT: v_readlane_b32 s35, v40, 1 +; GCN-NEXT: v_readlane_b32 s34, v40, 0 ; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s4, v40, 20 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -748,26 +748,26 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) { ; GISEL-NEXT: s_mov_b64 exec, s[18:19] ; GISEL-NEXT: v_writelane_b32 v40, s16, 20 ; GISEL-NEXT: s_addk_i32 s32, 0x400 -; GISEL-NEXT: v_writelane_b32 v40, s30, 0 -; GISEL-NEXT: v_writelane_b32 v40, s31, 1 -; GISEL-NEXT: v_writelane_b32 v40, s34, 2 -; GISEL-NEXT: v_writelane_b32 v40, s35, 3 -; GISEL-NEXT: v_writelane_b32 v40, s36, 4 -; GISEL-NEXT: v_writelane_b32 v40, s37, 5 -; GISEL-NEXT: v_writelane_b32 v40, s38, 6 -; GISEL-NEXT: v_writelane_b32 v40, s39, 7 -; GISEL-NEXT: v_writelane_b32 v40, s48, 8 -; GISEL-NEXT: v_writelane_b32 v40, s49, 9 -; GISEL-NEXT: v_writelane_b32 v40, s50, 10 -; GISEL-NEXT: v_writelane_b32 v40, s51, 11 -; GISEL-NEXT: v_writelane_b32 v40, s52, 12 -; GISEL-NEXT: v_writelane_b32 v40, s53, 13 -; GISEL-NEXT: v_writelane_b32 v40, s54, 14 -; GISEL-NEXT: v_writelane_b32 v40, s55, 15 -; GISEL-NEXT: v_writelane_b32 v40, s64, 16 -; GISEL-NEXT: v_writelane_b32 v40, s65, 17 -; GISEL-NEXT: v_writelane_b32 v40, s66, 18 -; GISEL-NEXT: v_writelane_b32 v40, s67, 19 +; GISEL-NEXT: v_writelane_b32 v40, s34, 0 +; GISEL-NEXT: v_writelane_b32 v40, s35, 1 +; GISEL-NEXT: v_writelane_b32 v40, s36, 2 +; GISEL-NEXT: v_writelane_b32 v40, s37, 3 +; GISEL-NEXT: v_writelane_b32 v40, s38, 4 +; GISEL-NEXT: v_writelane_b32 v40, s39, 5 +; GISEL-NEXT: v_writelane_b32 v40, s48, 6 +; GISEL-NEXT: v_writelane_b32 v40, s49, 7 +; GISEL-NEXT: v_writelane_b32 v40, s50, 8 +; GISEL-NEXT: v_writelane_b32 v40, s51, 9 +; GISEL-NEXT: v_writelane_b32 v40, s52, 10 +; GISEL-NEXT: v_writelane_b32 v40, s53, 11 +; GISEL-NEXT: v_writelane_b32 v40, s54, 12 +; GISEL-NEXT: v_writelane_b32 v40, s55, 13 +; GISEL-NEXT: v_writelane_b32 v40, s64, 14 +; GISEL-NEXT: v_writelane_b32 v40, s65, 15 +; GISEL-NEXT: v_writelane_b32 v40, s66, 16 +; GISEL-NEXT: v_writelane_b32 v40, s67, 17 +; GISEL-NEXT: v_writelane_b32 v40, s30, 18 +; GISEL-NEXT: v_writelane_b32 v40, s31, 19 ; GISEL-NEXT: s_mov_b32 s50, s15 ; GISEL-NEXT: s_mov_b32 s51, s14 ; GISEL-NEXT: s_mov_b32 s52, s13 @@ -804,26 +804,26 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) { ; GISEL-NEXT: s_mov_b64 exec, s[64:65] ; GISEL-NEXT: .LBB5_4: ; %bb2 ; GISEL-NEXT: s_or_b64 exec, exec, s[54:55] -; GISEL-NEXT: v_readlane_b32 s67, v40, 19 -; GISEL-NEXT: v_readlane_b32 s66, v40, 18 -; GISEL-NEXT: v_readlane_b32 s65, v40, 17 -; GISEL-NEXT: v_readlane_b32 s64, v40, 16 -; GISEL-NEXT: v_readlane_b32 s55, v40, 15 -; GISEL-NEXT: v_readlane_b32 s54, v40, 14 -; GISEL-NEXT: v_readlane_b32 s53, v40, 13 -; GISEL-NEXT: v_readlane_b32 s52, v40, 12 -; GISEL-NEXT: v_readlane_b32 s51, v40, 11 -; GISEL-NEXT: v_readlane_b32 s50, v40, 10 -; GISEL-NEXT: v_readlane_b32 s49, v40, 9 -; GISEL-NEXT: v_readlane_b32 s48, v40, 8 -; GISEL-NEXT: v_readlane_b32 s39, v40, 7 -; GISEL-NEXT: v_readlane_b32 s38, v40, 6 -; GISEL-NEXT: v_readlane_b32 s37, v40, 5 -; GISEL-NEXT: v_readlane_b32 s36, v40, 4 -; GISEL-NEXT: v_readlane_b32 s35, v40, 3 -; GISEL-NEXT: v_readlane_b32 s34, v40, 2 -; GISEL-NEXT: v_readlane_b32 s31, v40, 1 -; GISEL-NEXT: v_readlane_b32 s30, v40, 0 +; GISEL-NEXT: v_readlane_b32 s30, v40, 18 +; GISEL-NEXT: v_readlane_b32 s31, v40, 19 +; GISEL-NEXT: v_readlane_b32 s67, v40, 17 +; GISEL-NEXT: v_readlane_b32 s66, v40, 16 +; GISEL-NEXT: v_readlane_b32 s65, v40, 15 +; GISEL-NEXT: v_readlane_b32 s64, v40, 14 +; GISEL-NEXT: v_readlane_b32 s55, v40, 13 +; GISEL-NEXT: v_readlane_b32 s54, v40, 12 +; GISEL-NEXT: v_readlane_b32 s53, v40, 11 +; GISEL-NEXT: v_readlane_b32 s52, v40, 10 +; GISEL-NEXT: v_readlane_b32 s51, v40, 9 +; GISEL-NEXT: v_readlane_b32 s50, v40, 8 +; GISEL-NEXT: v_readlane_b32 s49, v40, 7 +; GISEL-NEXT: v_readlane_b32 s48, v40, 6 +; GISEL-NEXT: v_readlane_b32 s39, v40, 5 +; GISEL-NEXT: v_readlane_b32 s38, v40, 4 +; GISEL-NEXT: v_readlane_b32 s37, v40, 3 +; GISEL-NEXT: v_readlane_b32 s36, v40, 2 +; GISEL-NEXT: v_readlane_b32 s35, v40, 1 +; GISEL-NEXT: v_readlane_b32 s34, v40, 0 ; GISEL-NEXT: s_mov_b32 s32, s33 ; GISEL-NEXT: v_readlane_b32 s4, v40, 20 ; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -853,22 +853,22 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) { ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_addk_i32 s32, 0x400 -; GCN-NEXT: v_writelane_b32 v40, s30, 0 -; GCN-NEXT: v_writelane_b32 v40, s31, 1 -; GCN-NEXT: v_writelane_b32 v40, s34, 2 -; GCN-NEXT: v_writelane_b32 v40, s35, 3 -; GCN-NEXT: v_writelane_b32 v40, s36, 4 -; GCN-NEXT: v_writelane_b32 v40, s37, 5 -; GCN-NEXT: v_writelane_b32 v40, s38, 6 -; GCN-NEXT: v_writelane_b32 v40, s39, 7 -; GCN-NEXT: v_writelane_b32 v40, s48, 8 -; GCN-NEXT: v_writelane_b32 v40, s49, 9 -; GCN-NEXT: v_writelane_b32 v40, s50, 10 -; GCN-NEXT: v_writelane_b32 v40, s51, 11 -; GCN-NEXT: v_writelane_b32 v40, s52, 12 -; GCN-NEXT: v_writelane_b32 v40, s53, 13 -; GCN-NEXT: v_writelane_b32 v40, s54, 14 -; GCN-NEXT: v_writelane_b32 v40, s55, 15 +; GCN-NEXT: v_writelane_b32 v40, s34, 0 +; GCN-NEXT: v_writelane_b32 v40, s35, 1 +; GCN-NEXT: v_writelane_b32 v40, s36, 2 +; GCN-NEXT: v_writelane_b32 v40, s37, 3 +; GCN-NEXT: v_writelane_b32 v40, s38, 4 +; GCN-NEXT: v_writelane_b32 v40, s39, 5 +; GCN-NEXT: v_writelane_b32 v40, s48, 6 +; GCN-NEXT: v_writelane_b32 v40, s49, 7 +; GCN-NEXT: v_writelane_b32 v40, s50, 8 +; GCN-NEXT: v_writelane_b32 v40, s51, 9 +; GCN-NEXT: v_writelane_b32 v40, s52, 10 +; GCN-NEXT: v_writelane_b32 v40, s53, 11 +; GCN-NEXT: v_writelane_b32 v40, s54, 12 +; GCN-NEXT: v_writelane_b32 v40, s55, 13 +; GCN-NEXT: v_writelane_b32 v40, s30, 14 +; GCN-NEXT: v_writelane_b32 v40, s31, 15 ; GCN-NEXT: s_mov_b64 s[6:7], exec ; GCN-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: v_readfirstlane_b32 s8, v0 @@ -882,22 +882,22 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) { ; GCN-NEXT: s_cbranch_execnz .LBB6_1 ; GCN-NEXT: ; %bb.2: ; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: v_readlane_b32 s55, v40, 15 -; GCN-NEXT: v_readlane_b32 s54, v40, 14 -; GCN-NEXT: v_readlane_b32 s53, v40, 13 -; GCN-NEXT: v_readlane_b32 s52, v40, 12 -; GCN-NEXT: v_readlane_b32 s51, v40, 11 -; GCN-NEXT: v_readlane_b32 s50, v40, 10 -; GCN-NEXT: v_readlane_b32 s49, v40, 9 -; GCN-NEXT: v_readlane_b32 s48, v40, 8 -; GCN-NEXT: v_readlane_b32 s39, v40, 7 -; GCN-NEXT: v_readlane_b32 s38, v40, 6 -; GCN-NEXT: v_readlane_b32 s37, v40, 5 -; GCN-NEXT: v_readlane_b32 s36, v40, 4 -; GCN-NEXT: v_readlane_b32 s35, v40, 3 -; GCN-NEXT: v_readlane_b32 s34, v40, 2 -; GCN-NEXT: v_readlane_b32 s31, v40, 1 -; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: v_readlane_b32 s30, v40, 14 +; GCN-NEXT: v_readlane_b32 s31, v40, 15 +; GCN-NEXT: v_readlane_b32 s55, v40, 13 +; GCN-NEXT: v_readlane_b32 s54, v40, 12 +; GCN-NEXT: v_readlane_b32 s53, v40, 11 +; GCN-NEXT: v_readlane_b32 s52, v40, 10 +; GCN-NEXT: v_readlane_b32 s51, v40, 9 +; GCN-NEXT: v_readlane_b32 s50, v40, 8 +; GCN-NEXT: v_readlane_b32 s49, v40, 7 +; GCN-NEXT: v_readlane_b32 s48, v40, 6 +; GCN-NEXT: v_readlane_b32 s39, v40, 5 +; GCN-NEXT: v_readlane_b32 s38, v40, 4 +; GCN-NEXT: v_readlane_b32 s37, v40, 3 +; GCN-NEXT: v_readlane_b32 s36, v40, 2 +; GCN-NEXT: v_readlane_b32 s35, v40, 1 +; GCN-NEXT: v_readlane_b32 s34, v40, 0 ; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload @@ -915,22 +915,22 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) { ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GISEL-NEXT: s_mov_b64 exec, s[6:7] ; GISEL-NEXT: s_addk_i32 s32, 0x400 -; GISEL-NEXT: v_writelane_b32 v40, s30, 0 -; GISEL-NEXT: v_writelane_b32 v40, s31, 1 -; GISEL-NEXT: v_writelane_b32 v40, s34, 2 -; GISEL-NEXT: v_writelane_b32 v40, s35, 3 -; GISEL-NEXT: v_writelane_b32 v40, s36, 4 -; GISEL-NEXT: v_writelane_b32 v40, s37, 5 -; GISEL-NEXT: v_writelane_b32 v40, s38, 6 -; GISEL-NEXT: v_writelane_b32 v40, s39, 7 -; GISEL-NEXT: v_writelane_b32 v40, s48, 8 -; GISEL-NEXT: v_writelane_b32 v40, s49, 9 -; GISEL-NEXT: v_writelane_b32 v40, s50, 10 -; GISEL-NEXT: v_writelane_b32 v40, s51, 11 -; GISEL-NEXT: v_writelane_b32 v40, s52, 12 -; GISEL-NEXT: v_writelane_b32 v40, s53, 13 -; GISEL-NEXT: v_writelane_b32 v40, s54, 14 -; GISEL-NEXT: v_writelane_b32 v40, s55, 15 +; GISEL-NEXT: v_writelane_b32 v40, s34, 0 +; GISEL-NEXT: v_writelane_b32 v40, s35, 1 +; GISEL-NEXT: v_writelane_b32 v40, s36, 2 +; GISEL-NEXT: v_writelane_b32 v40, s37, 3 +; GISEL-NEXT: v_writelane_b32 v40, s38, 4 +; GISEL-NEXT: v_writelane_b32 v40, s39, 5 +; GISEL-NEXT: v_writelane_b32 v40, s48, 6 +; GISEL-NEXT: v_writelane_b32 v40, s49, 7 +; GISEL-NEXT: v_writelane_b32 v40, s50, 8 +; GISEL-NEXT: v_writelane_b32 v40, s51, 9 +; GISEL-NEXT: v_writelane_b32 v40, s52, 10 +; GISEL-NEXT: v_writelane_b32 v40, s53, 11 +; GISEL-NEXT: v_writelane_b32 v40, s54, 12 +; GISEL-NEXT: v_writelane_b32 v40, s55, 13 +; GISEL-NEXT: v_writelane_b32 v40, s30, 14 +; GISEL-NEXT: v_writelane_b32 v40, s31, 15 ; GISEL-NEXT: s_mov_b64 s[6:7], exec ; GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 ; GISEL-NEXT: v_readfirstlane_b32 s8, v0 @@ -944,22 +944,22 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) { ; GISEL-NEXT: s_cbranch_execnz .LBB6_1 ; GISEL-NEXT: ; %bb.2: ; GISEL-NEXT: s_mov_b64 exec, s[6:7] -; GISEL-NEXT: v_readlane_b32 s55, v40, 15 -; GISEL-NEXT: v_readlane_b32 s54, v40, 14 -; GISEL-NEXT: v_readlane_b32 s53, v40, 13 -; GISEL-NEXT: v_readlane_b32 s52, v40, 12 -; GISEL-NEXT: v_readlane_b32 s51, v40, 11 -; GISEL-NEXT: v_readlane_b32 s50, v40, 10 -; GISEL-NEXT: v_readlane_b32 s49, v40, 9 -; GISEL-NEXT: v_readlane_b32 s48, v40, 8 -; GISEL-NEXT: v_readlane_b32 s39, v40, 7 -; GISEL-NEXT: v_readlane_b32 s38, v40, 6 -; GISEL-NEXT: v_readlane_b32 s37, v40, 5 -; GISEL-NEXT: v_readlane_b32 s36, v40, 4 -; GISEL-NEXT: v_readlane_b32 s35, v40, 3 -; GISEL-NEXT: v_readlane_b32 s34, v40, 2 -; GISEL-NEXT: v_readlane_b32 s31, v40, 1 -; GISEL-NEXT: v_readlane_b32 s30, v40, 0 +; GISEL-NEXT: v_readlane_b32 s30, v40, 14 +; GISEL-NEXT: v_readlane_b32 s31, v40, 15 +; GISEL-NEXT: v_readlane_b32 s55, v40, 13 +; GISEL-NEXT: v_readlane_b32 s54, v40, 12 +; GISEL-NEXT: v_readlane_b32 s53, v40, 11 +; GISEL-NEXT: v_readlane_b32 s52, v40, 10 +; GISEL-NEXT: v_readlane_b32 s51, v40, 9 +; GISEL-NEXT: v_readlane_b32 s50, v40, 8 +; GISEL-NEXT: v_readlane_b32 s49, v40, 7 +; GISEL-NEXT: v_readlane_b32 s48, v40, 6 +; GISEL-NEXT: v_readlane_b32 s39, v40, 5 +; GISEL-NEXT: v_readlane_b32 s38, v40, 4 +; GISEL-NEXT: v_readlane_b32 s37, v40, 3 +; GISEL-NEXT: v_readlane_b32 s36, v40, 2 +; GISEL-NEXT: v_readlane_b32 s35, v40, 1 +; GISEL-NEXT: v_readlane_b32 s34, v40, 0 ; GISEL-NEXT: s_mov_b32 s32, s33 ; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload @@ -982,22 +982,22 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) { ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_writelane_b32 v41, s30, 0 -; GCN-NEXT: v_writelane_b32 v41, s31, 1 -; GCN-NEXT: v_writelane_b32 v41, s34, 2 -; GCN-NEXT: v_writelane_b32 v41, s35, 3 -; GCN-NEXT: v_writelane_b32 v41, s36, 4 -; GCN-NEXT: v_writelane_b32 v41, s37, 5 -; GCN-NEXT: v_writelane_b32 v41, s38, 6 -; GCN-NEXT: v_writelane_b32 v41, s39, 7 -; GCN-NEXT: v_writelane_b32 v41, s48, 8 -; GCN-NEXT: v_writelane_b32 v41, s49, 9 -; GCN-NEXT: v_writelane_b32 v41, s50, 10 -; GCN-NEXT: v_writelane_b32 v41, s51, 11 -; GCN-NEXT: v_writelane_b32 v41, s52, 12 -; GCN-NEXT: v_writelane_b32 v41, s53, 13 -; GCN-NEXT: v_writelane_b32 v41, s54, 14 -; GCN-NEXT: v_writelane_b32 v41, s55, 15 +; GCN-NEXT: v_writelane_b32 v41, s34, 0 +; GCN-NEXT: v_writelane_b32 v41, s35, 1 +; GCN-NEXT: v_writelane_b32 v41, s36, 2 +; GCN-NEXT: v_writelane_b32 v41, s37, 3 +; GCN-NEXT: v_writelane_b32 v41, s38, 4 +; GCN-NEXT: v_writelane_b32 v41, s39, 5 +; GCN-NEXT: v_writelane_b32 v41, s48, 6 +; GCN-NEXT: v_writelane_b32 v41, s49, 7 +; GCN-NEXT: v_writelane_b32 v41, s50, 8 +; GCN-NEXT: v_writelane_b32 v41, s51, 9 +; GCN-NEXT: v_writelane_b32 v41, s52, 10 +; GCN-NEXT: v_writelane_b32 v41, s53, 11 +; GCN-NEXT: v_writelane_b32 v41, s54, 12 +; GCN-NEXT: v_writelane_b32 v41, s55, 13 +; GCN-NEXT: v_writelane_b32 v41, s30, 14 +; GCN-NEXT: v_writelane_b32 v41, s31, 15 ; GCN-NEXT: v_mov_b32_e32 v40, v0 ; GCN-NEXT: s_mov_b64 s[4:5], exec ; GCN-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 @@ -1013,22 +1013,22 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) { ; GCN-NEXT: ; %bb.2: ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, v40 -; GCN-NEXT: v_readlane_b32 s55, v41, 15 -; GCN-NEXT: v_readlane_b32 s54, v41, 14 -; GCN-NEXT: v_readlane_b32 s53, v41, 13 -; GCN-NEXT: v_readlane_b32 s52, v41, 12 -; GCN-NEXT: v_readlane_b32 s51, v41, 11 -; GCN-NEXT: v_readlane_b32 s50, v41, 10 -; GCN-NEXT: v_readlane_b32 s49, v41, 9 -; GCN-NEXT: v_readlane_b32 s48, v41, 8 -; GCN-NEXT: v_readlane_b32 s39, v41, 7 -; GCN-NEXT: v_readlane_b32 s38, v41, 6 -; GCN-NEXT: v_readlane_b32 s37, v41, 5 -; GCN-NEXT: v_readlane_b32 s36, v41, 4 -; GCN-NEXT: v_readlane_b32 s35, v41, 3 -; GCN-NEXT: v_readlane_b32 s34, v41, 2 -; GCN-NEXT: v_readlane_b32 s31, v41, 1 -; GCN-NEXT: v_readlane_b32 s30, v41, 0 +; GCN-NEXT: v_readlane_b32 s30, v41, 14 +; GCN-NEXT: v_readlane_b32 s31, v41, 15 +; GCN-NEXT: v_readlane_b32 s55, v41, 13 +; GCN-NEXT: v_readlane_b32 s54, v41, 12 +; GCN-NEXT: v_readlane_b32 s53, v41, 11 +; GCN-NEXT: v_readlane_b32 s52, v41, 10 +; GCN-NEXT: v_readlane_b32 s51, v41, 9 +; GCN-NEXT: v_readlane_b32 s50, v41, 8 +; GCN-NEXT: v_readlane_b32 s49, v41, 7 +; GCN-NEXT: v_readlane_b32 s48, v41, 6 +; GCN-NEXT: v_readlane_b32 s39, v41, 5 +; GCN-NEXT: v_readlane_b32 s38, v41, 4 +; GCN-NEXT: v_readlane_b32 s37, v41, 3 +; GCN-NEXT: v_readlane_b32 s36, v41, 2 +; GCN-NEXT: v_readlane_b32 s35, v41, 1 +; GCN-NEXT: v_readlane_b32 s34, v41, 0 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 @@ -1048,22 +1048,22 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) { ; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_addk_i32 s32, 0x400 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GISEL-NEXT: v_writelane_b32 v41, s30, 0 -; GISEL-NEXT: v_writelane_b32 v41, s31, 1 -; GISEL-NEXT: v_writelane_b32 v41, s34, 2 -; GISEL-NEXT: v_writelane_b32 v41, s35, 3 -; GISEL-NEXT: v_writelane_b32 v41, s36, 4 -; GISEL-NEXT: v_writelane_b32 v41, s37, 5 -; GISEL-NEXT: v_writelane_b32 v41, s38, 6 -; GISEL-NEXT: v_writelane_b32 v41, s39, 7 -; GISEL-NEXT: v_writelane_b32 v41, s48, 8 -; GISEL-NEXT: v_writelane_b32 v41, s49, 9 -; GISEL-NEXT: v_writelane_b32 v41, s50, 10 -; GISEL-NEXT: v_writelane_b32 v41, s51, 11 -; GISEL-NEXT: v_writelane_b32 v41, s52, 12 -; GISEL-NEXT: v_writelane_b32 v41, s53, 13 -; GISEL-NEXT: v_writelane_b32 v41, s54, 14 -; GISEL-NEXT: v_writelane_b32 v41, s55, 15 +; GISEL-NEXT: v_writelane_b32 v41, s34, 0 +; GISEL-NEXT: v_writelane_b32 v41, s35, 1 +; GISEL-NEXT: v_writelane_b32 v41, s36, 2 +; GISEL-NEXT: v_writelane_b32 v41, s37, 3 +; GISEL-NEXT: v_writelane_b32 v41, s38, 4 +; GISEL-NEXT: v_writelane_b32 v41, s39, 5 +; GISEL-NEXT: v_writelane_b32 v41, s48, 6 +; GISEL-NEXT: v_writelane_b32 v41, s49, 7 +; GISEL-NEXT: v_writelane_b32 v41, s50, 8 +; GISEL-NEXT: v_writelane_b32 v41, s51, 9 +; GISEL-NEXT: v_writelane_b32 v41, s52, 10 +; GISEL-NEXT: v_writelane_b32 v41, s53, 11 +; GISEL-NEXT: v_writelane_b32 v41, s54, 12 +; GISEL-NEXT: v_writelane_b32 v41, s55, 13 +; GISEL-NEXT: v_writelane_b32 v41, s30, 14 +; GISEL-NEXT: v_writelane_b32 v41, s31, 15 ; GISEL-NEXT: v_mov_b32_e32 v40, v0 ; GISEL-NEXT: s_mov_b64 s[4:5], exec ; GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 @@ -1079,22 +1079,22 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) { ; GISEL-NEXT: ; %bb.2: ; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: v_mov_b32_e32 v0, v40 -; GISEL-NEXT: v_readlane_b32 s55, v41, 15 -; GISEL-NEXT: v_readlane_b32 s54, v41, 14 -; GISEL-NEXT: v_readlane_b32 s53, v41, 13 -; GISEL-NEXT: v_readlane_b32 s52, v41, 12 -; GISEL-NEXT: v_readlane_b32 s51, v41, 11 -; GISEL-NEXT: v_readlane_b32 s50, v41, 10 -; GISEL-NEXT: v_readlane_b32 s49, v41, 9 -; GISEL-NEXT: v_readlane_b32 s48, v41, 8 -; GISEL-NEXT: v_readlane_b32 s39, v41, 7 -; GISEL-NEXT: v_readlane_b32 s38, v41, 6 -; GISEL-NEXT: v_readlane_b32 s37, v41, 5 -; GISEL-NEXT: v_readlane_b32 s36, v41, 4 -; GISEL-NEXT: v_readlane_b32 s35, v41, 3 -; GISEL-NEXT: v_readlane_b32 s34, v41, 2 -; GISEL-NEXT: v_readlane_b32 s31, v41, 1 -; GISEL-NEXT: v_readlane_b32 s30, v41, 0 +; GISEL-NEXT: v_readlane_b32 s30, v41, 14 +; GISEL-NEXT: v_readlane_b32 s31, v41, 15 +; GISEL-NEXT: v_readlane_b32 s55, v41, 13 +; GISEL-NEXT: v_readlane_b32 s54, v41, 12 +; GISEL-NEXT: v_readlane_b32 s53, v41, 11 +; GISEL-NEXT: v_readlane_b32 s52, v41, 10 +; GISEL-NEXT: v_readlane_b32 s51, v41, 9 +; GISEL-NEXT: v_readlane_b32 s50, v41, 8 +; GISEL-NEXT: v_readlane_b32 s49, v41, 7 +; GISEL-NEXT: v_readlane_b32 s48, v41, 6 +; GISEL-NEXT: v_readlane_b32 s39, v41, 5 +; GISEL-NEXT: v_readlane_b32 s38, v41, 4 +; GISEL-NEXT: v_readlane_b32 s37, v41, 3 +; GISEL-NEXT: v_readlane_b32 s36, v41, 2 +; GISEL-NEXT: v_readlane_b32 s35, v41, 1 +; GISEL-NEXT: v_readlane_b32 s34, v41, 0 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b32 s32, s33 ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 @@ -1121,22 +1121,22 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) { ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0x400 -; GCN-NEXT: v_writelane_b32 v40, s30, 0 -; GCN-NEXT: v_writelane_b32 v40, s31, 1 -; GCN-NEXT: v_writelane_b32 v40, s34, 2 -; GCN-NEXT: v_writelane_b32 v40, s35, 3 -; GCN-NEXT: v_writelane_b32 v40, s36, 4 -; GCN-NEXT: v_writelane_b32 v40, s37, 5 -; GCN-NEXT: v_writelane_b32 v40, s38, 6 -; GCN-NEXT: v_writelane_b32 v40, s39, 7 -; GCN-NEXT: v_writelane_b32 v40, s48, 8 -; GCN-NEXT: v_writelane_b32 v40, s49, 9 -; GCN-NEXT: v_writelane_b32 v40, s50, 10 -; GCN-NEXT: v_writelane_b32 v40, s51, 11 -; GCN-NEXT: v_writelane_b32 v40, s52, 12 -; GCN-NEXT: v_writelane_b32 v40, s53, 13 -; GCN-NEXT: v_writelane_b32 v40, s54, 14 -; GCN-NEXT: v_writelane_b32 v40, s55, 15 +; GCN-NEXT: v_writelane_b32 v40, s34, 0 +; GCN-NEXT: v_writelane_b32 v40, s35, 1 +; GCN-NEXT: v_writelane_b32 v40, s36, 2 +; GCN-NEXT: v_writelane_b32 v40, s37, 3 +; GCN-NEXT: v_writelane_b32 v40, s38, 4 +; GCN-NEXT: v_writelane_b32 v40, s39, 5 +; GCN-NEXT: v_writelane_b32 v40, s48, 6 +; GCN-NEXT: v_writelane_b32 v40, s49, 7 +; GCN-NEXT: v_writelane_b32 v40, s50, 8 +; GCN-NEXT: v_writelane_b32 v40, s51, 9 +; GCN-NEXT: v_writelane_b32 v40, s52, 10 +; GCN-NEXT: v_writelane_b32 v40, s53, 11 +; GCN-NEXT: v_writelane_b32 v40, s54, 12 +; GCN-NEXT: v_writelane_b32 v40, s55, 13 +; GCN-NEXT: v_writelane_b32 v40, s30, 14 +; GCN-NEXT: v_writelane_b32 v40, s31, 15 ; GCN-NEXT: s_mov_b64 s[4:5], exec ; GCN-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: v_readfirstlane_b32 s8, v1 @@ -1152,22 +1152,22 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) { ; GCN-NEXT: ; %bb.2: ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, v3 -; GCN-NEXT: v_readlane_b32 s55, v40, 15 -; GCN-NEXT: v_readlane_b32 s54, v40, 14 -; GCN-NEXT: v_readlane_b32 s53, v40, 13 -; GCN-NEXT: v_readlane_b32 s52, v40, 12 -; GCN-NEXT: v_readlane_b32 s51, v40, 11 -; GCN-NEXT: v_readlane_b32 s50, v40, 10 -; GCN-NEXT: v_readlane_b32 s49, v40, 9 -; GCN-NEXT: v_readlane_b32 s48, v40, 8 -; GCN-NEXT: v_readlane_b32 s39, v40, 7 -; GCN-NEXT: v_readlane_b32 s38, v40, 6 -; GCN-NEXT: v_readlane_b32 s37, v40, 5 -; GCN-NEXT: v_readlane_b32 s36, v40, 4 -; GCN-NEXT: v_readlane_b32 s35, v40, 3 -; GCN-NEXT: v_readlane_b32 s34, v40, 2 -; GCN-NEXT: v_readlane_b32 s31, v40, 1 -; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: v_readlane_b32 s30, v40, 14 +; GCN-NEXT: v_readlane_b32 s31, v40, 15 +; GCN-NEXT: v_readlane_b32 s55, v40, 13 +; GCN-NEXT: v_readlane_b32 s54, v40, 12 +; GCN-NEXT: v_readlane_b32 s53, v40, 11 +; GCN-NEXT: v_readlane_b32 s52, v40, 10 +; GCN-NEXT: v_readlane_b32 s51, v40, 9 +; GCN-NEXT: v_readlane_b32 s50, v40, 8 +; GCN-NEXT: v_readlane_b32 s49, v40, 7 +; GCN-NEXT: v_readlane_b32 s48, v40, 6 +; GCN-NEXT: v_readlane_b32 s39, v40, 5 +; GCN-NEXT: v_readlane_b32 s38, v40, 4 +; GCN-NEXT: v_readlane_b32 s37, v40, 3 +; GCN-NEXT: v_readlane_b32 s36, v40, 2 +; GCN-NEXT: v_readlane_b32 s35, v40, 1 +; GCN-NEXT: v_readlane_b32 s34, v40, 0 ; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload @@ -1185,22 +1185,22 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) { ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_addk_i32 s32, 0x400 -; GISEL-NEXT: v_writelane_b32 v40, s30, 0 -; GISEL-NEXT: v_writelane_b32 v40, s31, 1 -; GISEL-NEXT: v_writelane_b32 v40, s34, 2 -; GISEL-NEXT: v_writelane_b32 v40, s35, 3 -; GISEL-NEXT: v_writelane_b32 v40, s36, 4 -; GISEL-NEXT: v_writelane_b32 v40, s37, 5 -; GISEL-NEXT: v_writelane_b32 v40, s38, 6 -; GISEL-NEXT: v_writelane_b32 v40, s39, 7 -; GISEL-NEXT: v_writelane_b32 v40, s48, 8 -; GISEL-NEXT: v_writelane_b32 v40, s49, 9 -; GISEL-NEXT: v_writelane_b32 v40, s50, 10 -; GISEL-NEXT: v_writelane_b32 v40, s51, 11 -; GISEL-NEXT: v_writelane_b32 v40, s52, 12 -; GISEL-NEXT: v_writelane_b32 v40, s53, 13 -; GISEL-NEXT: v_writelane_b32 v40, s54, 14 -; GISEL-NEXT: v_writelane_b32 v40, s55, 15 +; GISEL-NEXT: v_writelane_b32 v40, s34, 0 +; GISEL-NEXT: v_writelane_b32 v40, s35, 1 +; GISEL-NEXT: v_writelane_b32 v40, s36, 2 +; GISEL-NEXT: v_writelane_b32 v40, s37, 3 +; GISEL-NEXT: v_writelane_b32 v40, s38, 4 +; GISEL-NEXT: v_writelane_b32 v40, s39, 5 +; GISEL-NEXT: v_writelane_b32 v40, s48, 6 +; GISEL-NEXT: v_writelane_b32 v40, s49, 7 +; GISEL-NEXT: v_writelane_b32 v40, s50, 8 +; GISEL-NEXT: v_writelane_b32 v40, s51, 9 +; GISEL-NEXT: v_writelane_b32 v40, s52, 10 +; GISEL-NEXT: v_writelane_b32 v40, s53, 11 +; GISEL-NEXT: v_writelane_b32 v40, s54, 12 +; GISEL-NEXT: v_writelane_b32 v40, s55, 13 +; GISEL-NEXT: v_writelane_b32 v40, s30, 14 +; GISEL-NEXT: v_writelane_b32 v40, s31, 15 ; GISEL-NEXT: s_mov_b64 s[4:5], exec ; GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 ; GISEL-NEXT: v_readfirstlane_b32 s8, v1 @@ -1216,22 +1216,22 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) { ; GISEL-NEXT: ; %bb.2: ; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: v_mov_b32_e32 v0, v2 -; GISEL-NEXT: v_readlane_b32 s55, v40, 15 -; GISEL-NEXT: v_readlane_b32 s54, v40, 14 -; GISEL-NEXT: v_readlane_b32 s53, v40, 13 -; GISEL-NEXT: v_readlane_b32 s52, v40, 12 -; GISEL-NEXT: v_readlane_b32 s51, v40, 11 -; GISEL-NEXT: v_readlane_b32 s50, v40, 10 -; GISEL-NEXT: v_readlane_b32 s49, v40, 9 -; GISEL-NEXT: v_readlane_b32 s48, v40, 8 -; GISEL-NEXT: v_readlane_b32 s39, v40, 7 -; GISEL-NEXT: v_readlane_b32 s38, v40, 6 -; GISEL-NEXT: v_readlane_b32 s37, v40, 5 -; GISEL-NEXT: v_readlane_b32 s36, v40, 4 -; GISEL-NEXT: v_readlane_b32 s35, v40, 3 -; GISEL-NEXT: v_readlane_b32 s34, v40, 2 -; GISEL-NEXT: v_readlane_b32 s31, v40, 1 -; GISEL-NEXT: v_readlane_b32 s30, v40, 0 +; GISEL-NEXT: v_readlane_b32 s30, v40, 14 +; GISEL-NEXT: v_readlane_b32 s31, v40, 15 +; GISEL-NEXT: v_readlane_b32 s55, v40, 13 +; GISEL-NEXT: v_readlane_b32 s54, v40, 12 +; GISEL-NEXT: v_readlane_b32 s53, v40, 11 +; GISEL-NEXT: v_readlane_b32 s52, v40, 10 +; GISEL-NEXT: v_readlane_b32 s51, v40, 9 +; GISEL-NEXT: v_readlane_b32 s50, v40, 8 +; GISEL-NEXT: v_readlane_b32 s49, v40, 7 +; GISEL-NEXT: v_readlane_b32 s48, v40, 6 +; GISEL-NEXT: v_readlane_b32 s39, v40, 5 +; GISEL-NEXT: v_readlane_b32 s38, v40, 4 +; GISEL-NEXT: v_readlane_b32 s37, v40, 3 +; GISEL-NEXT: v_readlane_b32 s36, v40, 2 +; GISEL-NEXT: v_readlane_b32 s35, v40, 1 +; GISEL-NEXT: v_readlane_b32 s34, v40, 0 ; GISEL-NEXT: s_mov_b32 s32, s33 ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload @@ -1254,22 +1254,22 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) { ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0x400 -; GCN-NEXT: v_writelane_b32 v40, s30, 0 -; GCN-NEXT: v_writelane_b32 v40, s31, 1 -; GCN-NEXT: v_writelane_b32 v40, s34, 2 -; GCN-NEXT: v_writelane_b32 v40, s35, 3 -; GCN-NEXT: v_writelane_b32 v40, s36, 4 -; GCN-NEXT: v_writelane_b32 v40, s37, 5 -; GCN-NEXT: v_writelane_b32 v40, s38, 6 -; GCN-NEXT: v_writelane_b32 v40, s39, 7 -; GCN-NEXT: v_writelane_b32 v40, s48, 8 -; GCN-NEXT: v_writelane_b32 v40, s49, 9 -; GCN-NEXT: v_writelane_b32 v40, s50, 10 -; GCN-NEXT: v_writelane_b32 v40, s51, 11 -; GCN-NEXT: v_writelane_b32 v40, s52, 12 -; GCN-NEXT: v_writelane_b32 v40, s53, 13 -; GCN-NEXT: v_writelane_b32 v40, s54, 14 -; GCN-NEXT: v_writelane_b32 v40, s55, 15 +; GCN-NEXT: v_writelane_b32 v40, s34, 0 +; GCN-NEXT: v_writelane_b32 v40, s35, 1 +; GCN-NEXT: v_writelane_b32 v40, s36, 2 +; GCN-NEXT: v_writelane_b32 v40, s37, 3 +; GCN-NEXT: v_writelane_b32 v40, s38, 4 +; GCN-NEXT: v_writelane_b32 v40, s39, 5 +; GCN-NEXT: v_writelane_b32 v40, s48, 6 +; GCN-NEXT: v_writelane_b32 v40, s49, 7 +; GCN-NEXT: v_writelane_b32 v40, s50, 8 +; GCN-NEXT: v_writelane_b32 v40, s51, 9 +; GCN-NEXT: v_writelane_b32 v40, s52, 10 +; GCN-NEXT: v_writelane_b32 v40, s53, 11 +; GCN-NEXT: v_writelane_b32 v40, s54, 12 +; GCN-NEXT: v_writelane_b32 v40, s55, 13 +; GCN-NEXT: v_writelane_b32 v40, s30, 14 +; GCN-NEXT: v_writelane_b32 v40, s31, 15 ; GCN-NEXT: s_mov_b64 s[4:5], exec ; GCN-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: v_readfirstlane_b32 s6, v0 @@ -1282,22 +1282,22 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) { ; GCN-NEXT: s_cbranch_execnz .LBB9_1 ; GCN-NEXT: ; %bb.2: ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_readlane_b32 s55, v40, 15 -; GCN-NEXT: v_readlane_b32 s54, v40, 14 -; GCN-NEXT: v_readlane_b32 s53, v40, 13 -; GCN-NEXT: v_readlane_b32 s52, v40, 12 -; GCN-NEXT: v_readlane_b32 s51, v40, 11 -; GCN-NEXT: v_readlane_b32 s50, v40, 10 -; GCN-NEXT: v_readlane_b32 s49, v40, 9 -; GCN-NEXT: v_readlane_b32 s48, v40, 8 -; GCN-NEXT: v_readlane_b32 s39, v40, 7 -; GCN-NEXT: v_readlane_b32 s38, v40, 6 -; GCN-NEXT: v_readlane_b32 s37, v40, 5 -; GCN-NEXT: v_readlane_b32 s36, v40, 4 -; GCN-NEXT: v_readlane_b32 s35, v40, 3 -; GCN-NEXT: v_readlane_b32 s34, v40, 2 -; GCN-NEXT: v_readlane_b32 s31, v40, 1 -; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: v_readlane_b32 s30, v40, 14 +; GCN-NEXT: v_readlane_b32 s31, v40, 15 +; GCN-NEXT: v_readlane_b32 s55, v40, 13 +; GCN-NEXT: v_readlane_b32 s54, v40, 12 +; GCN-NEXT: v_readlane_b32 s53, v40, 11 +; GCN-NEXT: v_readlane_b32 s52, v40, 10 +; GCN-NEXT: v_readlane_b32 s51, v40, 9 +; GCN-NEXT: v_readlane_b32 s50, v40, 8 +; GCN-NEXT: v_readlane_b32 s49, v40, 7 +; GCN-NEXT: v_readlane_b32 s48, v40, 6 +; GCN-NEXT: v_readlane_b32 s39, v40, 5 +; GCN-NEXT: v_readlane_b32 s38, v40, 4 +; GCN-NEXT: v_readlane_b32 s37, v40, 3 +; GCN-NEXT: v_readlane_b32 s36, v40, 2 +; GCN-NEXT: v_readlane_b32 s35, v40, 1 +; GCN-NEXT: v_readlane_b32 s34, v40, 0 ; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload @@ -1315,22 +1315,22 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) { ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_addk_i32 s32, 0x400 -; GISEL-NEXT: v_writelane_b32 v40, s30, 0 -; GISEL-NEXT: v_writelane_b32 v40, s31, 1 -; GISEL-NEXT: v_writelane_b32 v40, s34, 2 -; GISEL-NEXT: v_writelane_b32 v40, s35, 3 -; GISEL-NEXT: v_writelane_b32 v40, s36, 4 -; GISEL-NEXT: v_writelane_b32 v40, s37, 5 -; GISEL-NEXT: v_writelane_b32 v40, s38, 6 -; GISEL-NEXT: v_writelane_b32 v40, s39, 7 -; GISEL-NEXT: v_writelane_b32 v40, s48, 8 -; GISEL-NEXT: v_writelane_b32 v40, s49, 9 -; GISEL-NEXT: v_writelane_b32 v40, s50, 10 -; GISEL-NEXT: v_writelane_b32 v40, s51, 11 -; GISEL-NEXT: v_writelane_b32 v40, s52, 12 -; GISEL-NEXT: v_writelane_b32 v40, s53, 13 -; GISEL-NEXT: v_writelane_b32 v40, s54, 14 -; GISEL-NEXT: v_writelane_b32 v40, s55, 15 +; GISEL-NEXT: v_writelane_b32 v40, s34, 0 +; GISEL-NEXT: v_writelane_b32 v40, s35, 1 +; GISEL-NEXT: v_writelane_b32 v40, s36, 2 +; GISEL-NEXT: v_writelane_b32 v40, s37, 3 +; GISEL-NEXT: v_writelane_b32 v40, s38, 4 +; GISEL-NEXT: v_writelane_b32 v40, s39, 5 +; GISEL-NEXT: v_writelane_b32 v40, s48, 6 +; GISEL-NEXT: v_writelane_b32 v40, s49, 7 +; GISEL-NEXT: v_writelane_b32 v40, s50, 8 +; GISEL-NEXT: v_writelane_b32 v40, s51, 9 +; GISEL-NEXT: v_writelane_b32 v40, s52, 10 +; GISEL-NEXT: v_writelane_b32 v40, s53, 11 +; GISEL-NEXT: v_writelane_b32 v40, s54, 12 +; GISEL-NEXT: v_writelane_b32 v40, s55, 13 +; GISEL-NEXT: v_writelane_b32 v40, s30, 14 +; GISEL-NEXT: v_writelane_b32 v40, s31, 15 ; GISEL-NEXT: s_mov_b64 s[4:5], exec ; GISEL-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 ; GISEL-NEXT: v_readfirstlane_b32 s6, v0 @@ -1343,22 +1343,22 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) { ; GISEL-NEXT: s_cbranch_execnz .LBB9_1 ; GISEL-NEXT: ; %bb.2: ; GISEL-NEXT: s_mov_b64 exec, s[4:5] -; GISEL-NEXT: v_readlane_b32 s55, v40, 15 -; GISEL-NEXT: v_readlane_b32 s54, v40, 14 -; GISEL-NEXT: v_readlane_b32 s53, v40, 13 -; GISEL-NEXT: v_readlane_b32 s52, v40, 12 -; GISEL-NEXT: v_readlane_b32 s51, v40, 11 -; GISEL-NEXT: v_readlane_b32 s50, v40, 10 -; GISEL-NEXT: v_readlane_b32 s49, v40, 9 -; GISEL-NEXT: v_readlane_b32 s48, v40, 8 -; GISEL-NEXT: v_readlane_b32 s39, v40, 7 -; GISEL-NEXT: v_readlane_b32 s38, v40, 6 -; GISEL-NEXT: v_readlane_b32 s37, v40, 5 -; GISEL-NEXT: v_readlane_b32 s36, v40, 4 -; GISEL-NEXT: v_readlane_b32 s35, v40, 3 -; GISEL-NEXT: v_readlane_b32 s34, v40, 2 -; GISEL-NEXT: v_readlane_b32 s31, v40, 1 -; GISEL-NEXT: v_readlane_b32 s30, v40, 0 +; GISEL-NEXT: v_readlane_b32 s30, v40, 14 +; GISEL-NEXT: v_readlane_b32 s31, v40, 15 +; GISEL-NEXT: v_readlane_b32 s55, v40, 13 +; GISEL-NEXT: v_readlane_b32 s54, v40, 12 +; GISEL-NEXT: v_readlane_b32 s53, v40, 11 +; GISEL-NEXT: v_readlane_b32 s52, v40, 10 +; GISEL-NEXT: v_readlane_b32 s51, v40, 9 +; GISEL-NEXT: v_readlane_b32 s50, v40, 8 +; GISEL-NEXT: v_readlane_b32 s49, v40, 7 +; GISEL-NEXT: v_readlane_b32 s48, v40, 6 +; GISEL-NEXT: v_readlane_b32 s39, v40, 5 +; GISEL-NEXT: v_readlane_b32 s38, v40, 4 +; GISEL-NEXT: v_readlane_b32 s37, v40, 3 +; GISEL-NEXT: v_readlane_b32 s36, v40, 2 +; GISEL-NEXT: v_readlane_b32 s35, v40, 1 +; GISEL-NEXT: v_readlane_b32 s34, v40, 0 ; GISEL-NEXT: s_mov_b32 s32, s33 ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll index c3f391786f878..fcc43ffd0140e 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll @@ -35,8 +35,8 @@ define void @f0() { ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s31, v4, 1 ; GFX11-NEXT: v_readlane_b32 s30, v4, 0 +; GFX11-NEXT: v_readlane_b32 s31, v4, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v4, off, s33 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll index df784a6e00cfc..19869e85ec9d9 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll @@ -7,13 +7,13 @@ define fastcc i32 @foo() #0 { ; CHECK-LABEL: name: foo ; CHECK: bb.0 (%ir-block.0): ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr30, $sgpr31, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: S_WAITCNT 0 ; CHECK-NEXT: $sgpr16 = S_MOV_B32 $sgpr33 ; CHECK-NEXT: $sgpr33 = S_MOV_B32 $sgpr32 ; CHECK-NEXT: $sgpr17 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) + ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) ; CHECK-NEXT: $exec_lo = S_MOV_B32 killed $sgpr17 ; CHECK-NEXT: $sgpr32 = frame-setup S_ADDK_I32 $sgpr32, 512, implicit-def dead $scc ; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr16, 2, undef $vgpr40 @@ -26,8 +26,8 @@ define fastcc i32 @foo() #0 { ; CHECK-NEXT: BUFFER_GL1_INV implicit $exec ; CHECK-NEXT: BUFFER_GL0_INV implicit $exec ; CHECK-NEXT: renamable $sgpr16_sgpr17 = S_LOAD_DWORDX2_IMM killed renamable $sgpr16_sgpr17, 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) - ; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr30, 0, $vgpr40 - ; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr31, 1, $vgpr40 + ; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr30, 0, $vgpr40, implicit-def $sgpr30_sgpr31, implicit $sgpr30_sgpr31 + ; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr31, 1, $vgpr40, implicit $sgpr30_sgpr31 ; CHECK-NEXT: S_WAITCNT 49279 ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr16_sgpr17, @bar, csr_amdgpu, implicit killed $sgpr4_sgpr5, implicit killed $sgpr6_sgpr7, implicit killed $sgpr8_sgpr9, implicit killed $sgpr10_sgpr11, implicit killed $sgpr12, implicit killed $sgpr13, implicit killed $sgpr14, implicit killed $sgpr15, implicit killed $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: $vcc_lo = S_MOV_B32 $exec_lo @@ -39,12 +39,12 @@ define fastcc i32 @foo() #0 { ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc_lo ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2.DummyReturnBlock: + ; CHECK-NEXT: $sgpr30 = V_READLANE_B32 $vgpr40, 0, implicit-def $sgpr30_sgpr31 ; CHECK-NEXT: $sgpr31 = V_READLANE_B32 $vgpr40, 1 - ; CHECK-NEXT: $sgpr30 = V_READLANE_B32 $vgpr40, 0 ; CHECK-NEXT: $sgpr32 = S_MOV_B32 $sgpr33 ; CHECK-NEXT: $sgpr4 = V_READLANE_B32 $vgpr40, 2 ; CHECK-NEXT: $sgpr5 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; CHECK-NEXT: $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) + ; CHECK-NEXT: $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) ; CHECK-NEXT: $exec_lo = S_MOV_B32 killed $sgpr5 ; CHECK-NEXT: $sgpr33 = S_MOV_B32 killed $sgpr4 ; CHECK-NEXT: S_WAITCNT 16240 diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll index 17581bcb61e99..8c2c16ccdc2a0 100644 --- a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll @@ -37,26 +37,26 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 ; GFX7-NEXT: s_add_i32 s6, s32, 0x101100 ; GFX7-NEXT: buffer_store_dword v23, off, s[0:3], s6 ; 4-byte Folded Spill ; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: v_writelane_b32 v23, s30, 0 -; GFX7-NEXT: v_writelane_b32 v23, s31, 1 -; GFX7-NEXT: v_writelane_b32 v23, s33, 2 -; GFX7-NEXT: v_writelane_b32 v23, s34, 3 -; GFX7-NEXT: v_writelane_b32 v23, s35, 4 -; GFX7-NEXT: v_writelane_b32 v23, s36, 5 -; GFX7-NEXT: v_writelane_b32 v23, s37, 6 -; GFX7-NEXT: v_writelane_b32 v23, s38, 7 -; GFX7-NEXT: v_writelane_b32 v23, s39, 8 -; GFX7-NEXT: v_writelane_b32 v23, s48, 9 -; GFX7-NEXT: v_writelane_b32 v23, s49, 10 -; GFX7-NEXT: v_writelane_b32 v23, s50, 11 -; GFX7-NEXT: v_writelane_b32 v23, s51, 12 -; GFX7-NEXT: v_writelane_b32 v23, s52, 13 -; GFX7-NEXT: v_writelane_b32 v23, s53, 14 +; GFX7-NEXT: v_writelane_b32 v23, s33, 0 +; GFX7-NEXT: v_writelane_b32 v23, s34, 1 +; GFX7-NEXT: v_writelane_b32 v23, s35, 2 +; GFX7-NEXT: v_writelane_b32 v23, s36, 3 +; GFX7-NEXT: v_writelane_b32 v23, s37, 4 +; GFX7-NEXT: v_writelane_b32 v23, s38, 5 +; GFX7-NEXT: v_writelane_b32 v23, s39, 6 +; GFX7-NEXT: v_writelane_b32 v23, s48, 7 +; GFX7-NEXT: v_writelane_b32 v23, s49, 8 +; GFX7-NEXT: v_writelane_b32 v23, s50, 9 +; GFX7-NEXT: v_writelane_b32 v23, s51, 10 +; GFX7-NEXT: v_writelane_b32 v23, s52, 11 +; GFX7-NEXT: v_writelane_b32 v23, s53, 12 +; GFX7-NEXT: v_writelane_b32 v23, s54, 13 +; GFX7-NEXT: v_writelane_b32 v23, s55, 14 ; GFX7-NEXT: v_lshr_b32_e64 v0, s32, 6 -; GFX7-NEXT: v_writelane_b32 v23, s54, 15 +; GFX7-NEXT: v_writelane_b32 v23, s30, 15 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 64, v0 ; GFX7-NEXT: s_and_b64 s[4:5], 0, exec -; GFX7-NEXT: v_writelane_b32 v23, s55, 16 +; GFX7-NEXT: v_writelane_b32 v23, s31, 16 ; GFX7-NEXT: ;;#ASMSTART ; GFX7-NEXT: ; use alloca0 v0 ; GFX7-NEXT: ;;#ASMEND @@ -73,23 +73,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 ; GFX7-NEXT: ;;#ASMSTART ; GFX7-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc ; GFX7-NEXT: ;;#ASMEND -; GFX7-NEXT: v_readlane_b32 s55, v23, 16 -; GFX7-NEXT: v_readlane_b32 s54, v23, 15 -; GFX7-NEXT: v_readlane_b32 s53, v23, 14 -; GFX7-NEXT: v_readlane_b32 s52, v23, 13 -; GFX7-NEXT: v_readlane_b32 s51, v23, 12 -; GFX7-NEXT: v_readlane_b32 s50, v23, 11 -; GFX7-NEXT: v_readlane_b32 s49, v23, 10 -; GFX7-NEXT: v_readlane_b32 s48, v23, 9 -; GFX7-NEXT: v_readlane_b32 s39, v23, 8 -; GFX7-NEXT: v_readlane_b32 s38, v23, 7 -; GFX7-NEXT: v_readlane_b32 s37, v23, 6 -; GFX7-NEXT: v_readlane_b32 s36, v23, 5 -; GFX7-NEXT: v_readlane_b32 s35, v23, 4 -; GFX7-NEXT: v_readlane_b32 s34, v23, 3 -; GFX7-NEXT: v_readlane_b32 s33, v23, 2 -; GFX7-NEXT: v_readlane_b32 s31, v23, 1 -; GFX7-NEXT: v_readlane_b32 s30, v23, 0 +; GFX7-NEXT: v_readlane_b32 s30, v23, 15 +; GFX7-NEXT: v_readlane_b32 s31, v23, 16 +; GFX7-NEXT: v_readlane_b32 s55, v23, 14 +; GFX7-NEXT: v_readlane_b32 s54, v23, 13 +; GFX7-NEXT: v_readlane_b32 s53, v23, 12 +; GFX7-NEXT: v_readlane_b32 s52, v23, 11 +; GFX7-NEXT: v_readlane_b32 s51, v23, 10 +; GFX7-NEXT: v_readlane_b32 s50, v23, 9 +; GFX7-NEXT: v_readlane_b32 s49, v23, 8 +; GFX7-NEXT: v_readlane_b32 s48, v23, 7 +; GFX7-NEXT: v_readlane_b32 s39, v23, 6 +; GFX7-NEXT: v_readlane_b32 s38, v23, 5 +; GFX7-NEXT: v_readlane_b32 s37, v23, 4 +; GFX7-NEXT: v_readlane_b32 s36, v23, 3 +; GFX7-NEXT: v_readlane_b32 s35, v23, 2 +; GFX7-NEXT: v_readlane_b32 s34, v23, 1 +; GFX7-NEXT: v_readlane_b32 s33, v23, 0 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: s_add_i32 s6, s32, 0x101100 ; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s6 ; 4-byte Folded Reload @@ -104,26 +104,26 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 ; GFX8-NEXT: s_add_i32 s6, s32, 0x101100 ; GFX8-NEXT: buffer_store_dword v23, off, s[0:3], s6 ; 4-byte Folded Spill ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_writelane_b32 v23, s30, 0 -; GFX8-NEXT: v_writelane_b32 v23, s31, 1 -; GFX8-NEXT: v_writelane_b32 v23, s33, 2 -; GFX8-NEXT: v_writelane_b32 v23, s34, 3 -; GFX8-NEXT: v_writelane_b32 v23, s35, 4 -; GFX8-NEXT: v_writelane_b32 v23, s36, 5 -; GFX8-NEXT: v_writelane_b32 v23, s37, 6 -; GFX8-NEXT: v_writelane_b32 v23, s38, 7 -; GFX8-NEXT: v_writelane_b32 v23, s39, 8 -; GFX8-NEXT: v_writelane_b32 v23, s48, 9 -; GFX8-NEXT: v_writelane_b32 v23, s49, 10 -; GFX8-NEXT: v_writelane_b32 v23, s50, 11 -; GFX8-NEXT: v_writelane_b32 v23, s51, 12 -; GFX8-NEXT: v_writelane_b32 v23, s52, 13 -; GFX8-NEXT: v_writelane_b32 v23, s53, 14 +; GFX8-NEXT: v_writelane_b32 v23, s33, 0 +; GFX8-NEXT: v_writelane_b32 v23, s34, 1 +; GFX8-NEXT: v_writelane_b32 v23, s35, 2 +; GFX8-NEXT: v_writelane_b32 v23, s36, 3 +; GFX8-NEXT: v_writelane_b32 v23, s37, 4 +; GFX8-NEXT: v_writelane_b32 v23, s38, 5 +; GFX8-NEXT: v_writelane_b32 v23, s39, 6 +; GFX8-NEXT: v_writelane_b32 v23, s48, 7 +; GFX8-NEXT: v_writelane_b32 v23, s49, 8 +; GFX8-NEXT: v_writelane_b32 v23, s50, 9 +; GFX8-NEXT: v_writelane_b32 v23, s51, 10 +; GFX8-NEXT: v_writelane_b32 v23, s52, 11 +; GFX8-NEXT: v_writelane_b32 v23, s53, 12 +; GFX8-NEXT: v_writelane_b32 v23, s54, 13 +; GFX8-NEXT: v_writelane_b32 v23, s55, 14 ; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32 -; GFX8-NEXT: v_writelane_b32 v23, s54, 15 +; GFX8-NEXT: v_writelane_b32 v23, s30, 15 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 64, v0 ; GFX8-NEXT: s_and_b64 s[4:5], 0, exec -; GFX8-NEXT: v_writelane_b32 v23, s55, 16 +; GFX8-NEXT: v_writelane_b32 v23, s31, 16 ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use alloca0 v0 ; GFX8-NEXT: ;;#ASMEND @@ -141,23 +141,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc ; GFX8-NEXT: ;;#ASMEND -; GFX8-NEXT: v_readlane_b32 s55, v23, 16 -; GFX8-NEXT: v_readlane_b32 s54, v23, 15 -; GFX8-NEXT: v_readlane_b32 s53, v23, 14 -; GFX8-NEXT: v_readlane_b32 s52, v23, 13 -; GFX8-NEXT: v_readlane_b32 s51, v23, 12 -; GFX8-NEXT: v_readlane_b32 s50, v23, 11 -; GFX8-NEXT: v_readlane_b32 s49, v23, 10 -; GFX8-NEXT: v_readlane_b32 s48, v23, 9 -; GFX8-NEXT: v_readlane_b32 s39, v23, 8 -; GFX8-NEXT: v_readlane_b32 s38, v23, 7 -; GFX8-NEXT: v_readlane_b32 s37, v23, 6 -; GFX8-NEXT: v_readlane_b32 s36, v23, 5 -; GFX8-NEXT: v_readlane_b32 s35, v23, 4 -; GFX8-NEXT: v_readlane_b32 s34, v23, 3 -; GFX8-NEXT: v_readlane_b32 s33, v23, 2 -; GFX8-NEXT: v_readlane_b32 s31, v23, 1 -; GFX8-NEXT: v_readlane_b32 s30, v23, 0 +; GFX8-NEXT: v_readlane_b32 s30, v23, 15 +; GFX8-NEXT: v_readlane_b32 s31, v23, 16 +; GFX8-NEXT: v_readlane_b32 s55, v23, 14 +; GFX8-NEXT: v_readlane_b32 s54, v23, 13 +; GFX8-NEXT: v_readlane_b32 s53, v23, 12 +; GFX8-NEXT: v_readlane_b32 s52, v23, 11 +; GFX8-NEXT: v_readlane_b32 s51, v23, 10 +; GFX8-NEXT: v_readlane_b32 s50, v23, 9 +; GFX8-NEXT: v_readlane_b32 s49, v23, 8 +; GFX8-NEXT: v_readlane_b32 s48, v23, 7 +; GFX8-NEXT: v_readlane_b32 s39, v23, 6 +; GFX8-NEXT: v_readlane_b32 s38, v23, 5 +; GFX8-NEXT: v_readlane_b32 s37, v23, 4 +; GFX8-NEXT: v_readlane_b32 s36, v23, 3 +; GFX8-NEXT: v_readlane_b32 s35, v23, 2 +; GFX8-NEXT: v_readlane_b32 s34, v23, 1 +; GFX8-NEXT: v_readlane_b32 s33, v23, 0 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: s_add_i32 s6, s32, 0x101100 ; GFX8-NEXT: buffer_load_dword v23, off, s[0:3], s6 ; 4-byte Folded Reload @@ -172,26 +172,26 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 ; GFX900-NEXT: s_add_i32 s6, s32, 0x101100 ; GFX900-NEXT: buffer_store_dword v23, off, s[0:3], s6 ; 4-byte Folded Spill ; GFX900-NEXT: s_mov_b64 exec, s[4:5] -; GFX900-NEXT: v_writelane_b32 v23, s30, 0 -; GFX900-NEXT: v_writelane_b32 v23, s31, 1 -; GFX900-NEXT: v_writelane_b32 v23, s33, 2 -; GFX900-NEXT: v_writelane_b32 v23, s34, 3 -; GFX900-NEXT: v_writelane_b32 v23, s35, 4 -; GFX900-NEXT: v_writelane_b32 v23, s36, 5 -; GFX900-NEXT: v_writelane_b32 v23, s37, 6 -; GFX900-NEXT: v_writelane_b32 v23, s38, 7 -; GFX900-NEXT: v_writelane_b32 v23, s39, 8 -; GFX900-NEXT: v_writelane_b32 v23, s48, 9 -; GFX900-NEXT: v_writelane_b32 v23, s49, 10 -; GFX900-NEXT: v_writelane_b32 v23, s50, 11 -; GFX900-NEXT: v_writelane_b32 v23, s51, 12 -; GFX900-NEXT: v_writelane_b32 v23, s52, 13 -; GFX900-NEXT: v_writelane_b32 v23, s53, 14 +; GFX900-NEXT: v_writelane_b32 v23, s33, 0 +; GFX900-NEXT: v_writelane_b32 v23, s34, 1 +; GFX900-NEXT: v_writelane_b32 v23, s35, 2 +; GFX900-NEXT: v_writelane_b32 v23, s36, 3 +; GFX900-NEXT: v_writelane_b32 v23, s37, 4 +; GFX900-NEXT: v_writelane_b32 v23, s38, 5 +; GFX900-NEXT: v_writelane_b32 v23, s39, 6 +; GFX900-NEXT: v_writelane_b32 v23, s48, 7 +; GFX900-NEXT: v_writelane_b32 v23, s49, 8 +; GFX900-NEXT: v_writelane_b32 v23, s50, 9 +; GFX900-NEXT: v_writelane_b32 v23, s51, 10 +; GFX900-NEXT: v_writelane_b32 v23, s52, 11 +; GFX900-NEXT: v_writelane_b32 v23, s53, 12 +; GFX900-NEXT: v_writelane_b32 v23, s54, 13 +; GFX900-NEXT: v_writelane_b32 v23, s55, 14 ; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32 -; GFX900-NEXT: v_writelane_b32 v23, s54, 15 +; GFX900-NEXT: v_writelane_b32 v23, s30, 15 ; GFX900-NEXT: v_add_u32_e32 v0, 64, v0 ; GFX900-NEXT: s_and_b64 s[4:5], 0, exec -; GFX900-NEXT: v_writelane_b32 v23, s55, 16 +; GFX900-NEXT: v_writelane_b32 v23, s31, 16 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use alloca0 v0 ; GFX900-NEXT: ;;#ASMEND @@ -208,23 +208,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s55, v23, 16 -; GFX900-NEXT: v_readlane_b32 s54, v23, 15 -; GFX900-NEXT: v_readlane_b32 s53, v23, 14 -; GFX900-NEXT: v_readlane_b32 s52, v23, 13 -; GFX900-NEXT: v_readlane_b32 s51, v23, 12 -; GFX900-NEXT: v_readlane_b32 s50, v23, 11 -; GFX900-NEXT: v_readlane_b32 s49, v23, 10 -; GFX900-NEXT: v_readlane_b32 s48, v23, 9 -; GFX900-NEXT: v_readlane_b32 s39, v23, 8 -; GFX900-NEXT: v_readlane_b32 s38, v23, 7 -; GFX900-NEXT: v_readlane_b32 s37, v23, 6 -; GFX900-NEXT: v_readlane_b32 s36, v23, 5 -; GFX900-NEXT: v_readlane_b32 s35, v23, 4 -; GFX900-NEXT: v_readlane_b32 s34, v23, 3 -; GFX900-NEXT: v_readlane_b32 s33, v23, 2 -; GFX900-NEXT: v_readlane_b32 s31, v23, 1 -; GFX900-NEXT: v_readlane_b32 s30, v23, 0 +; GFX900-NEXT: v_readlane_b32 s30, v23, 15 +; GFX900-NEXT: v_readlane_b32 s31, v23, 16 +; GFX900-NEXT: v_readlane_b32 s55, v23, 14 +; GFX900-NEXT: v_readlane_b32 s54, v23, 13 +; GFX900-NEXT: v_readlane_b32 s53, v23, 12 +; GFX900-NEXT: v_readlane_b32 s52, v23, 11 +; GFX900-NEXT: v_readlane_b32 s51, v23, 10 +; GFX900-NEXT: v_readlane_b32 s50, v23, 9 +; GFX900-NEXT: v_readlane_b32 s49, v23, 8 +; GFX900-NEXT: v_readlane_b32 s48, v23, 7 +; GFX900-NEXT: v_readlane_b32 s39, v23, 6 +; GFX900-NEXT: v_readlane_b32 s38, v23, 5 +; GFX900-NEXT: v_readlane_b32 s37, v23, 4 +; GFX900-NEXT: v_readlane_b32 s36, v23, 3 +; GFX900-NEXT: v_readlane_b32 s35, v23, 2 +; GFX900-NEXT: v_readlane_b32 s34, v23, 1 +; GFX900-NEXT: v_readlane_b32 s33, v23, 0 ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: s_add_i32 s6, s32, 0x101100 ; GFX900-NEXT: buffer_load_dword v23, off, s[0:3], s6 ; 4-byte Folded Reload @@ -239,26 +239,26 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 ; GFX942-NEXT: s_add_i32 s2, s32, 0x4044 ; GFX942-NEXT: scratch_store_dword off, v23, s2 ; 4-byte Folded Spill ; GFX942-NEXT: s_mov_b64 exec, s[0:1] -; GFX942-NEXT: v_writelane_b32 v23, s30, 0 -; GFX942-NEXT: v_writelane_b32 v23, s31, 1 -; GFX942-NEXT: v_writelane_b32 v23, s33, 2 -; GFX942-NEXT: v_writelane_b32 v23, s34, 3 -; GFX942-NEXT: v_writelane_b32 v23, s35, 4 -; GFX942-NEXT: v_writelane_b32 v23, s36, 5 -; GFX942-NEXT: v_writelane_b32 v23, s37, 6 -; GFX942-NEXT: v_writelane_b32 v23, s38, 7 -; GFX942-NEXT: v_writelane_b32 v23, s39, 8 -; GFX942-NEXT: v_writelane_b32 v23, s48, 9 -; GFX942-NEXT: v_writelane_b32 v23, s49, 10 -; GFX942-NEXT: v_writelane_b32 v23, s50, 11 -; GFX942-NEXT: v_writelane_b32 v23, s51, 12 -; GFX942-NEXT: v_writelane_b32 v23, s52, 13 -; GFX942-NEXT: v_writelane_b32 v23, s53, 14 +; GFX942-NEXT: v_writelane_b32 v23, s33, 0 +; GFX942-NEXT: v_writelane_b32 v23, s34, 1 +; GFX942-NEXT: v_writelane_b32 v23, s35, 2 +; GFX942-NEXT: v_writelane_b32 v23, s36, 3 +; GFX942-NEXT: v_writelane_b32 v23, s37, 4 +; GFX942-NEXT: v_writelane_b32 v23, s38, 5 +; GFX942-NEXT: v_writelane_b32 v23, s39, 6 +; GFX942-NEXT: v_writelane_b32 v23, s48, 7 +; GFX942-NEXT: v_writelane_b32 v23, s49, 8 +; GFX942-NEXT: v_writelane_b32 v23, s50, 9 +; GFX942-NEXT: v_writelane_b32 v23, s51, 10 +; GFX942-NEXT: v_writelane_b32 v23, s52, 11 +; GFX942-NEXT: v_writelane_b32 v23, s53, 12 +; GFX942-NEXT: v_writelane_b32 v23, s54, 13 +; GFX942-NEXT: v_writelane_b32 v23, s55, 14 ; GFX942-NEXT: s_add_i32 s0, s32, 64 -; GFX942-NEXT: v_writelane_b32 v23, s54, 15 +; GFX942-NEXT: v_writelane_b32 v23, s30, 15 ; GFX942-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-NEXT: s_and_b64 s[60:61], 0, exec -; GFX942-NEXT: v_writelane_b32 v23, s55, 16 +; GFX942-NEXT: v_writelane_b32 v23, s31, 16 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use alloca0 v0 ; GFX942-NEXT: ;;#ASMEND @@ -273,23 +273,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_readlane_b32 s55, v23, 16 -; GFX942-NEXT: v_readlane_b32 s54, v23, 15 -; GFX942-NEXT: v_readlane_b32 s53, v23, 14 -; GFX942-NEXT: v_readlane_b32 s52, v23, 13 -; GFX942-NEXT: v_readlane_b32 s51, v23, 12 -; GFX942-NEXT: v_readlane_b32 s50, v23, 11 -; GFX942-NEXT: v_readlane_b32 s49, v23, 10 -; GFX942-NEXT: v_readlane_b32 s48, v23, 9 -; GFX942-NEXT: v_readlane_b32 s39, v23, 8 -; GFX942-NEXT: v_readlane_b32 s38, v23, 7 -; GFX942-NEXT: v_readlane_b32 s37, v23, 6 -; GFX942-NEXT: v_readlane_b32 s36, v23, 5 -; GFX942-NEXT: v_readlane_b32 s35, v23, 4 -; GFX942-NEXT: v_readlane_b32 s34, v23, 3 -; GFX942-NEXT: v_readlane_b32 s33, v23, 2 -; GFX942-NEXT: v_readlane_b32 s31, v23, 1 -; GFX942-NEXT: v_readlane_b32 s30, v23, 0 +; GFX942-NEXT: v_readlane_b32 s30, v23, 15 +; GFX942-NEXT: v_readlane_b32 s31, v23, 16 +; GFX942-NEXT: v_readlane_b32 s55, v23, 14 +; GFX942-NEXT: v_readlane_b32 s54, v23, 13 +; GFX942-NEXT: v_readlane_b32 s53, v23, 12 +; GFX942-NEXT: v_readlane_b32 s52, v23, 11 +; GFX942-NEXT: v_readlane_b32 s51, v23, 10 +; GFX942-NEXT: v_readlane_b32 s50, v23, 9 +; GFX942-NEXT: v_readlane_b32 s49, v23, 8 +; GFX942-NEXT: v_readlane_b32 s48, v23, 7 +; GFX942-NEXT: v_readlane_b32 s39, v23, 6 +; GFX942-NEXT: v_readlane_b32 s38, v23, 5 +; GFX942-NEXT: v_readlane_b32 s37, v23, 4 +; GFX942-NEXT: v_readlane_b32 s36, v23, 3 +; GFX942-NEXT: v_readlane_b32 s35, v23, 2 +; GFX942-NEXT: v_readlane_b32 s34, v23, 1 +; GFX942-NEXT: v_readlane_b32 s33, v23, 0 ; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX942-NEXT: s_add_i32 s2, s32, 0x4044 ; GFX942-NEXT: scratch_load_dword v23, off, s2 ; 4-byte Folded Reload @@ -305,29 +305,29 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 ; GFX10_1-NEXT: buffer_store_dword v23, off, s[0:3], s5 ; 4-byte Folded Spill ; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_1-NEXT: v_writelane_b32 v23, s30, 0 +; GFX10_1-NEXT: v_writelane_b32 v23, s33, 0 ; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32 ; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo -; GFX10_1-NEXT: v_writelane_b32 v23, s31, 1 +; GFX10_1-NEXT: v_writelane_b32 v23, s34, 1 ; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0 ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; use alloca0 v0 ; GFX10_1-NEXT: ;;#ASMEND -; GFX10_1-NEXT: v_writelane_b32 v23, s33, 2 -; GFX10_1-NEXT: v_writelane_b32 v23, s34, 3 -; GFX10_1-NEXT: v_writelane_b32 v23, s35, 4 -; GFX10_1-NEXT: v_writelane_b32 v23, s36, 5 -; GFX10_1-NEXT: v_writelane_b32 v23, s37, 6 -; GFX10_1-NEXT: v_writelane_b32 v23, s38, 7 -; GFX10_1-NEXT: v_writelane_b32 v23, s39, 8 -; GFX10_1-NEXT: v_writelane_b32 v23, s48, 9 -; GFX10_1-NEXT: v_writelane_b32 v23, s49, 10 -; GFX10_1-NEXT: v_writelane_b32 v23, s50, 11 -; GFX10_1-NEXT: v_writelane_b32 v23, s51, 12 -; GFX10_1-NEXT: v_writelane_b32 v23, s52, 13 -; GFX10_1-NEXT: v_writelane_b32 v23, s53, 14 -; GFX10_1-NEXT: v_writelane_b32 v23, s54, 15 -; GFX10_1-NEXT: v_writelane_b32 v23, s55, 16 +; GFX10_1-NEXT: v_writelane_b32 v23, s35, 2 +; GFX10_1-NEXT: v_writelane_b32 v23, s36, 3 +; GFX10_1-NEXT: v_writelane_b32 v23, s37, 4 +; GFX10_1-NEXT: v_writelane_b32 v23, s38, 5 +; GFX10_1-NEXT: v_writelane_b32 v23, s39, 6 +; GFX10_1-NEXT: v_writelane_b32 v23, s48, 7 +; GFX10_1-NEXT: v_writelane_b32 v23, s49, 8 +; GFX10_1-NEXT: v_writelane_b32 v23, s50, 9 +; GFX10_1-NEXT: v_writelane_b32 v23, s51, 10 +; GFX10_1-NEXT: v_writelane_b32 v23, s52, 11 +; GFX10_1-NEXT: v_writelane_b32 v23, s53, 12 +; GFX10_1-NEXT: v_writelane_b32 v23, s54, 13 +; GFX10_1-NEXT: v_writelane_b32 v23, s55, 14 +; GFX10_1-NEXT: v_writelane_b32 v23, s30, 15 +; GFX10_1-NEXT: v_writelane_b32 v23, s31, 16 ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc ; GFX10_1-NEXT: ;;#ASMEND @@ -338,23 +338,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc ; GFX10_1-NEXT: ;;#ASMEND -; GFX10_1-NEXT: v_readlane_b32 s55, v23, 16 -; GFX10_1-NEXT: v_readlane_b32 s54, v23, 15 -; GFX10_1-NEXT: v_readlane_b32 s53, v23, 14 -; GFX10_1-NEXT: v_readlane_b32 s52, v23, 13 -; GFX10_1-NEXT: v_readlane_b32 s51, v23, 12 -; GFX10_1-NEXT: v_readlane_b32 s50, v23, 11 -; GFX10_1-NEXT: v_readlane_b32 s49, v23, 10 -; GFX10_1-NEXT: v_readlane_b32 s48, v23, 9 -; GFX10_1-NEXT: v_readlane_b32 s39, v23, 8 -; GFX10_1-NEXT: v_readlane_b32 s38, v23, 7 -; GFX10_1-NEXT: v_readlane_b32 s37, v23, 6 -; GFX10_1-NEXT: v_readlane_b32 s36, v23, 5 -; GFX10_1-NEXT: v_readlane_b32 s35, v23, 4 -; GFX10_1-NEXT: v_readlane_b32 s34, v23, 3 -; GFX10_1-NEXT: v_readlane_b32 s33, v23, 2 -; GFX10_1-NEXT: v_readlane_b32 s31, v23, 1 -; GFX10_1-NEXT: v_readlane_b32 s30, v23, 0 +; GFX10_1-NEXT: v_readlane_b32 s30, v23, 15 +; GFX10_1-NEXT: v_readlane_b32 s31, v23, 16 +; GFX10_1-NEXT: v_readlane_b32 s55, v23, 14 +; GFX10_1-NEXT: v_readlane_b32 s54, v23, 13 +; GFX10_1-NEXT: v_readlane_b32 s53, v23, 12 +; GFX10_1-NEXT: v_readlane_b32 s52, v23, 11 +; GFX10_1-NEXT: v_readlane_b32 s51, v23, 10 +; GFX10_1-NEXT: v_readlane_b32 s50, v23, 9 +; GFX10_1-NEXT: v_readlane_b32 s49, v23, 8 +; GFX10_1-NEXT: v_readlane_b32 s48, v23, 7 +; GFX10_1-NEXT: v_readlane_b32 s39, v23, 6 +; GFX10_1-NEXT: v_readlane_b32 s38, v23, 5 +; GFX10_1-NEXT: v_readlane_b32 s37, v23, 4 +; GFX10_1-NEXT: v_readlane_b32 s36, v23, 3 +; GFX10_1-NEXT: v_readlane_b32 s35, v23, 2 +; GFX10_1-NEXT: v_readlane_b32 s34, v23, 1 +; GFX10_1-NEXT: v_readlane_b32 s33, v23, 0 ; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10_1-NEXT: s_add_i32 s5, s32, 0x80880 ; GFX10_1-NEXT: buffer_load_dword v23, off, s[0:3], s5 ; 4-byte Folded Reload @@ -370,29 +370,29 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 ; GFX10_3-NEXT: s_add_i32 s5, s32, 0x80880 ; GFX10_3-NEXT: buffer_store_dword v23, off, s[0:3], s5 ; 4-byte Folded Spill ; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_3-NEXT: v_writelane_b32 v23, s30, 0 +; GFX10_3-NEXT: v_writelane_b32 v23, s33, 0 ; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32 ; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo -; GFX10_3-NEXT: v_writelane_b32 v23, s31, 1 +; GFX10_3-NEXT: v_writelane_b32 v23, s34, 1 ; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0 ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; use alloca0 v0 ; GFX10_3-NEXT: ;;#ASMEND -; GFX10_3-NEXT: v_writelane_b32 v23, s33, 2 -; GFX10_3-NEXT: v_writelane_b32 v23, s34, 3 -; GFX10_3-NEXT: v_writelane_b32 v23, s35, 4 -; GFX10_3-NEXT: v_writelane_b32 v23, s36, 5 -; GFX10_3-NEXT: v_writelane_b32 v23, s37, 6 -; GFX10_3-NEXT: v_writelane_b32 v23, s38, 7 -; GFX10_3-NEXT: v_writelane_b32 v23, s39, 8 -; GFX10_3-NEXT: v_writelane_b32 v23, s48, 9 -; GFX10_3-NEXT: v_writelane_b32 v23, s49, 10 -; GFX10_3-NEXT: v_writelane_b32 v23, s50, 11 -; GFX10_3-NEXT: v_writelane_b32 v23, s51, 12 -; GFX10_3-NEXT: v_writelane_b32 v23, s52, 13 -; GFX10_3-NEXT: v_writelane_b32 v23, s53, 14 -; GFX10_3-NEXT: v_writelane_b32 v23, s54, 15 -; GFX10_3-NEXT: v_writelane_b32 v23, s55, 16 +; GFX10_3-NEXT: v_writelane_b32 v23, s35, 2 +; GFX10_3-NEXT: v_writelane_b32 v23, s36, 3 +; GFX10_3-NEXT: v_writelane_b32 v23, s37, 4 +; GFX10_3-NEXT: v_writelane_b32 v23, s38, 5 +; GFX10_3-NEXT: v_writelane_b32 v23, s39, 6 +; GFX10_3-NEXT: v_writelane_b32 v23, s48, 7 +; GFX10_3-NEXT: v_writelane_b32 v23, s49, 8 +; GFX10_3-NEXT: v_writelane_b32 v23, s50, 9 +; GFX10_3-NEXT: v_writelane_b32 v23, s51, 10 +; GFX10_3-NEXT: v_writelane_b32 v23, s52, 11 +; GFX10_3-NEXT: v_writelane_b32 v23, s53, 12 +; GFX10_3-NEXT: v_writelane_b32 v23, s54, 13 +; GFX10_3-NEXT: v_writelane_b32 v23, s55, 14 +; GFX10_3-NEXT: v_writelane_b32 v23, s30, 15 +; GFX10_3-NEXT: v_writelane_b32 v23, s31, 16 ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc ; GFX10_3-NEXT: ;;#ASMEND @@ -403,23 +403,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc ; GFX10_3-NEXT: ;;#ASMEND -; GFX10_3-NEXT: v_readlane_b32 s55, v23, 16 -; GFX10_3-NEXT: v_readlane_b32 s54, v23, 15 -; GFX10_3-NEXT: v_readlane_b32 s53, v23, 14 -; GFX10_3-NEXT: v_readlane_b32 s52, v23, 13 -; GFX10_3-NEXT: v_readlane_b32 s51, v23, 12 -; GFX10_3-NEXT: v_readlane_b32 s50, v23, 11 -; GFX10_3-NEXT: v_readlane_b32 s49, v23, 10 -; GFX10_3-NEXT: v_readlane_b32 s48, v23, 9 -; GFX10_3-NEXT: v_readlane_b32 s39, v23, 8 -; GFX10_3-NEXT: v_readlane_b32 s38, v23, 7 -; GFX10_3-NEXT: v_readlane_b32 s37, v23, 6 -; GFX10_3-NEXT: v_readlane_b32 s36, v23, 5 -; GFX10_3-NEXT: v_readlane_b32 s35, v23, 4 -; GFX10_3-NEXT: v_readlane_b32 s34, v23, 3 -; GFX10_3-NEXT: v_readlane_b32 s33, v23, 2 -; GFX10_3-NEXT: v_readlane_b32 s31, v23, 1 -; GFX10_3-NEXT: v_readlane_b32 s30, v23, 0 +; GFX10_3-NEXT: v_readlane_b32 s30, v23, 15 +; GFX10_3-NEXT: v_readlane_b32 s31, v23, 16 +; GFX10_3-NEXT: v_readlane_b32 s55, v23, 14 +; GFX10_3-NEXT: v_readlane_b32 s54, v23, 13 +; GFX10_3-NEXT: v_readlane_b32 s53, v23, 12 +; GFX10_3-NEXT: v_readlane_b32 s52, v23, 11 +; GFX10_3-NEXT: v_readlane_b32 s51, v23, 10 +; GFX10_3-NEXT: v_readlane_b32 s50, v23, 9 +; GFX10_3-NEXT: v_readlane_b32 s49, v23, 8 +; GFX10_3-NEXT: v_readlane_b32 s48, v23, 7 +; GFX10_3-NEXT: v_readlane_b32 s39, v23, 6 +; GFX10_3-NEXT: v_readlane_b32 s38, v23, 5 +; GFX10_3-NEXT: v_readlane_b32 s37, v23, 4 +; GFX10_3-NEXT: v_readlane_b32 s36, v23, 3 +; GFX10_3-NEXT: v_readlane_b32 s35, v23, 2 +; GFX10_3-NEXT: v_readlane_b32 s34, v23, 1 +; GFX10_3-NEXT: v_readlane_b32 s33, v23, 0 ; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10_3-NEXT: s_add_i32 s5, s32, 0x80880 ; GFX10_3-NEXT: buffer_load_dword v23, off, s[0:3], s5 ; 4-byte Folded Reload @@ -434,30 +434,30 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 ; GFX11-NEXT: s_add_i32 s1, s32, 0x4044 ; GFX11-NEXT: scratch_store_b32 off, v23, s1 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v23, s30, 0 +; GFX11-NEXT: v_writelane_b32 v23, s33, 0 ; GFX11-NEXT: s_add_i32 s0, s32, 64 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: s_and_b32 s0, 0, exec_lo -; GFX11-NEXT: v_writelane_b32 v23, s31, 1 +; GFX11-NEXT: v_writelane_b32 v23, s34, 1 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use alloca0 v0 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_writelane_b32 v23, s33, 2 -; GFX11-NEXT: v_writelane_b32 v23, s34, 3 -; GFX11-NEXT: v_writelane_b32 v23, s35, 4 -; GFX11-NEXT: v_writelane_b32 v23, s36, 5 -; GFX11-NEXT: v_writelane_b32 v23, s37, 6 -; GFX11-NEXT: v_writelane_b32 v23, s38, 7 -; GFX11-NEXT: v_writelane_b32 v23, s39, 8 -; GFX11-NEXT: v_writelane_b32 v23, s48, 9 -; GFX11-NEXT: v_writelane_b32 v23, s49, 10 -; GFX11-NEXT: v_writelane_b32 v23, s50, 11 -; GFX11-NEXT: v_writelane_b32 v23, s51, 12 -; GFX11-NEXT: v_writelane_b32 v23, s52, 13 -; GFX11-NEXT: v_writelane_b32 v23, s53, 14 -; GFX11-NEXT: v_writelane_b32 v23, s54, 15 -; GFX11-NEXT: v_writelane_b32 v23, s55, 16 +; GFX11-NEXT: v_writelane_b32 v23, s35, 2 +; GFX11-NEXT: v_writelane_b32 v23, s36, 3 +; GFX11-NEXT: v_writelane_b32 v23, s37, 4 +; GFX11-NEXT: v_writelane_b32 v23, s38, 5 +; GFX11-NEXT: v_writelane_b32 v23, s39, 6 +; GFX11-NEXT: v_writelane_b32 v23, s48, 7 +; GFX11-NEXT: v_writelane_b32 v23, s49, 8 +; GFX11-NEXT: v_writelane_b32 v23, s50, 9 +; GFX11-NEXT: v_writelane_b32 v23, s51, 10 +; GFX11-NEXT: v_writelane_b32 v23, s52, 11 +; GFX11-NEXT: v_writelane_b32 v23, s53, 12 +; GFX11-NEXT: v_writelane_b32 v23, s54, 13 +; GFX11-NEXT: v_writelane_b32 v23, s55, 14 +; GFX11-NEXT: v_writelane_b32 v23, s30, 15 +; GFX11-NEXT: v_writelane_b32 v23, s31, 16 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc ; GFX11-NEXT: ;;#ASMEND @@ -470,23 +470,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_readlane_b32 s55, v23, 16 -; GFX11-NEXT: v_readlane_b32 s54, v23, 15 -; GFX11-NEXT: v_readlane_b32 s53, v23, 14 -; GFX11-NEXT: v_readlane_b32 s52, v23, 13 -; GFX11-NEXT: v_readlane_b32 s51, v23, 12 -; GFX11-NEXT: v_readlane_b32 s50, v23, 11 -; GFX11-NEXT: v_readlane_b32 s49, v23, 10 -; GFX11-NEXT: v_readlane_b32 s48, v23, 9 -; GFX11-NEXT: v_readlane_b32 s39, v23, 8 -; GFX11-NEXT: v_readlane_b32 s38, v23, 7 -; GFX11-NEXT: v_readlane_b32 s37, v23, 6 -; GFX11-NEXT: v_readlane_b32 s36, v23, 5 -; GFX11-NEXT: v_readlane_b32 s35, v23, 4 -; GFX11-NEXT: v_readlane_b32 s34, v23, 3 -; GFX11-NEXT: v_readlane_b32 s33, v23, 2 -; GFX11-NEXT: v_readlane_b32 s31, v23, 1 -; GFX11-NEXT: v_readlane_b32 s30, v23, 0 +; GFX11-NEXT: v_readlane_b32 s30, v23, 15 +; GFX11-NEXT: v_readlane_b32 s31, v23, 16 +; GFX11-NEXT: v_readlane_b32 s55, v23, 14 +; GFX11-NEXT: v_readlane_b32 s54, v23, 13 +; GFX11-NEXT: v_readlane_b32 s53, v23, 12 +; GFX11-NEXT: v_readlane_b32 s52, v23, 11 +; GFX11-NEXT: v_readlane_b32 s51, v23, 10 +; GFX11-NEXT: v_readlane_b32 s50, v23, 9 +; GFX11-NEXT: v_readlane_b32 s49, v23, 8 +; GFX11-NEXT: v_readlane_b32 s48, v23, 7 +; GFX11-NEXT: v_readlane_b32 s39, v23, 6 +; GFX11-NEXT: v_readlane_b32 s38, v23, 5 +; GFX11-NEXT: v_readlane_b32 s37, v23, 4 +; GFX11-NEXT: v_readlane_b32 s36, v23, 3 +; GFX11-NEXT: v_readlane_b32 s35, v23, 2 +; GFX11-NEXT: v_readlane_b32 s34, v23, 1 +; GFX11-NEXT: v_readlane_b32 s33, v23, 0 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: s_add_i32 s1, s32, 0x4044 ; GFX11-NEXT: scratch_load_b32 v23, off, s1 ; 4-byte Folded Reload @@ -505,28 +505,28 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 ; GFX12-NEXT: scratch_store_b32 off, v23, s32 offset:16388 ; 4-byte Folded Spill ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 -; GFX12-NEXT: v_writelane_b32 v23, s30, 0 +; GFX12-NEXT: v_writelane_b32 v23, s33, 0 ; GFX12-NEXT: v_mov_b32_e32 v0, s32 ; GFX12-NEXT: s_and_b32 s0, 0, exec_lo ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use alloca0 v0 ; GFX12-NEXT: ;;#ASMEND -; GFX12-NEXT: v_writelane_b32 v23, s31, 1 -; GFX12-NEXT: v_writelane_b32 v23, s33, 2 -; GFX12-NEXT: v_writelane_b32 v23, s34, 3 -; GFX12-NEXT: v_writelane_b32 v23, s35, 4 -; GFX12-NEXT: v_writelane_b32 v23, s36, 5 -; GFX12-NEXT: v_writelane_b32 v23, s37, 6 -; GFX12-NEXT: v_writelane_b32 v23, s38, 7 -; GFX12-NEXT: v_writelane_b32 v23, s39, 8 -; GFX12-NEXT: v_writelane_b32 v23, s48, 9 -; GFX12-NEXT: v_writelane_b32 v23, s49, 10 -; GFX12-NEXT: v_writelane_b32 v23, s50, 11 -; GFX12-NEXT: v_writelane_b32 v23, s51, 12 -; GFX12-NEXT: v_writelane_b32 v23, s52, 13 -; GFX12-NEXT: v_writelane_b32 v23, s53, 14 -; GFX12-NEXT: v_writelane_b32 v23, s54, 15 -; GFX12-NEXT: v_writelane_b32 v23, s55, 16 +; GFX12-NEXT: v_writelane_b32 v23, s34, 1 +; GFX12-NEXT: v_writelane_b32 v23, s35, 2 +; GFX12-NEXT: v_writelane_b32 v23, s36, 3 +; GFX12-NEXT: v_writelane_b32 v23, s37, 4 +; GFX12-NEXT: v_writelane_b32 v23, s38, 5 +; GFX12-NEXT: v_writelane_b32 v23, s39, 6 +; GFX12-NEXT: v_writelane_b32 v23, s48, 7 +; GFX12-NEXT: v_writelane_b32 v23, s49, 8 +; GFX12-NEXT: v_writelane_b32 v23, s50, 9 +; GFX12-NEXT: v_writelane_b32 v23, s51, 10 +; GFX12-NEXT: v_writelane_b32 v23, s52, 11 +; GFX12-NEXT: v_writelane_b32 v23, s53, 12 +; GFX12-NEXT: v_writelane_b32 v23, s54, 13 +; GFX12-NEXT: v_writelane_b32 v23, s55, 14 +; GFX12-NEXT: v_writelane_b32 v23, s30, 15 +; GFX12-NEXT: v_writelane_b32 v23, s31, 16 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc ; GFX12-NEXT: ;;#ASMEND @@ -540,23 +540,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:22], vcc, s54, scc ; GFX12-NEXT: ;;#ASMEND -; GFX12-NEXT: v_readlane_b32 s55, v23, 16 -; GFX12-NEXT: v_readlane_b32 s54, v23, 15 -; GFX12-NEXT: v_readlane_b32 s53, v23, 14 -; GFX12-NEXT: v_readlane_b32 s52, v23, 13 -; GFX12-NEXT: v_readlane_b32 s51, v23, 12 -; GFX12-NEXT: v_readlane_b32 s50, v23, 11 -; GFX12-NEXT: v_readlane_b32 s49, v23, 10 -; GFX12-NEXT: v_readlane_b32 s48, v23, 9 -; GFX12-NEXT: v_readlane_b32 s39, v23, 8 -; GFX12-NEXT: v_readlane_b32 s38, v23, 7 -; GFX12-NEXT: v_readlane_b32 s37, v23, 6 -; GFX12-NEXT: v_readlane_b32 s36, v23, 5 -; GFX12-NEXT: v_readlane_b32 s35, v23, 4 -; GFX12-NEXT: v_readlane_b32 s34, v23, 3 -; GFX12-NEXT: v_readlane_b32 s33, v23, 2 -; GFX12-NEXT: v_readlane_b32 s31, v23, 1 -; GFX12-NEXT: v_readlane_b32 s30, v23, 0 +; GFX12-NEXT: v_readlane_b32 s30, v23, 15 +; GFX12-NEXT: v_readlane_b32 s31, v23, 16 +; GFX12-NEXT: v_readlane_b32 s55, v23, 14 +; GFX12-NEXT: v_readlane_b32 s54, v23, 13 +; GFX12-NEXT: v_readlane_b32 s53, v23, 12 +; GFX12-NEXT: v_readlane_b32 s52, v23, 11 +; GFX12-NEXT: v_readlane_b32 s51, v23, 10 +; GFX12-NEXT: v_readlane_b32 s50, v23, 9 +; GFX12-NEXT: v_readlane_b32 s49, v23, 8 +; GFX12-NEXT: v_readlane_b32 s48, v23, 7 +; GFX12-NEXT: v_readlane_b32 s39, v23, 6 +; GFX12-NEXT: v_readlane_b32 s38, v23, 5 +; GFX12-NEXT: v_readlane_b32 s37, v23, 4 +; GFX12-NEXT: v_readlane_b32 s36, v23, 3 +; GFX12-NEXT: v_readlane_b32 s35, v23, 2 +; GFX12-NEXT: v_readlane_b32 s34, v23, 1 +; GFX12-NEXT: v_readlane_b32 s33, v23, 0 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_load_b32 v23, off, s32 offset:16388 ; 4-byte Folded Reload ; GFX12-NEXT: s_wait_alu 0xfffe @@ -613,24 +613,24 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe ; GFX7-NEXT: s_add_i32 s6, s32, 0x100400 ; GFX7-NEXT: buffer_store_dword v21, off, s[0:3], s6 ; 4-byte Folded Spill ; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: v_writelane_b32 v21, s30, 0 -; GFX7-NEXT: v_writelane_b32 v21, s31, 1 -; GFX7-NEXT: v_writelane_b32 v21, s33, 2 -; GFX7-NEXT: v_writelane_b32 v21, s34, 3 -; GFX7-NEXT: v_writelane_b32 v21, s35, 4 -; GFX7-NEXT: v_writelane_b32 v21, s36, 5 -; GFX7-NEXT: v_writelane_b32 v21, s37, 6 -; GFX7-NEXT: v_writelane_b32 v21, s38, 7 -; GFX7-NEXT: v_writelane_b32 v21, s39, 8 -; GFX7-NEXT: v_writelane_b32 v21, s48, 9 -; GFX7-NEXT: v_writelane_b32 v21, s49, 10 -; GFX7-NEXT: v_writelane_b32 v21, s50, 11 -; GFX7-NEXT: v_writelane_b32 v21, s51, 12 -; GFX7-NEXT: v_writelane_b32 v21, s52, 13 -; GFX7-NEXT: v_writelane_b32 v21, s53, 14 -; GFX7-NEXT: v_writelane_b32 v21, s54, 15 +; GFX7-NEXT: v_writelane_b32 v21, s33, 0 +; GFX7-NEXT: v_writelane_b32 v21, s34, 1 +; GFX7-NEXT: v_writelane_b32 v21, s35, 2 +; GFX7-NEXT: v_writelane_b32 v21, s36, 3 +; GFX7-NEXT: v_writelane_b32 v21, s37, 4 +; GFX7-NEXT: v_writelane_b32 v21, s38, 5 +; GFX7-NEXT: v_writelane_b32 v21, s39, 6 +; GFX7-NEXT: v_writelane_b32 v21, s48, 7 +; GFX7-NEXT: v_writelane_b32 v21, s49, 8 +; GFX7-NEXT: v_writelane_b32 v21, s50, 9 +; GFX7-NEXT: v_writelane_b32 v21, s51, 10 +; GFX7-NEXT: v_writelane_b32 v21, s52, 11 +; GFX7-NEXT: v_writelane_b32 v21, s53, 12 +; GFX7-NEXT: v_writelane_b32 v21, s54, 13 +; GFX7-NEXT: v_writelane_b32 v21, s55, 14 +; GFX7-NEXT: v_writelane_b32 v21, s30, 15 ; GFX7-NEXT: s_and_b64 s[4:5], 0, exec -; GFX7-NEXT: v_writelane_b32 v21, s55, 16 +; GFX7-NEXT: v_writelane_b32 v21, s31, 16 ; GFX7-NEXT: ;;#ASMSTART ; GFX7-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc ; GFX7-NEXT: ;;#ASMEND @@ -640,23 +640,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe ; GFX7-NEXT: ;;#ASMSTART ; GFX7-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc ; GFX7-NEXT: ;;#ASMEND -; GFX7-NEXT: v_readlane_b32 s55, v21, 16 -; GFX7-NEXT: v_readlane_b32 s54, v21, 15 -; GFX7-NEXT: v_readlane_b32 s53, v21, 14 -; GFX7-NEXT: v_readlane_b32 s52, v21, 13 -; GFX7-NEXT: v_readlane_b32 s51, v21, 12 -; GFX7-NEXT: v_readlane_b32 s50, v21, 11 -; GFX7-NEXT: v_readlane_b32 s49, v21, 10 -; GFX7-NEXT: v_readlane_b32 s48, v21, 9 -; GFX7-NEXT: v_readlane_b32 s39, v21, 8 -; GFX7-NEXT: v_readlane_b32 s38, v21, 7 -; GFX7-NEXT: v_readlane_b32 s37, v21, 6 -; GFX7-NEXT: v_readlane_b32 s36, v21, 5 -; GFX7-NEXT: v_readlane_b32 s35, v21, 4 -; GFX7-NEXT: v_readlane_b32 s34, v21, 3 -; GFX7-NEXT: v_readlane_b32 s33, v21, 2 -; GFX7-NEXT: v_readlane_b32 s31, v21, 1 -; GFX7-NEXT: v_readlane_b32 s30, v21, 0 +; GFX7-NEXT: v_readlane_b32 s30, v21, 15 +; GFX7-NEXT: v_readlane_b32 s31, v21, 16 +; GFX7-NEXT: v_readlane_b32 s55, v21, 14 +; GFX7-NEXT: v_readlane_b32 s54, v21, 13 +; GFX7-NEXT: v_readlane_b32 s53, v21, 12 +; GFX7-NEXT: v_readlane_b32 s52, v21, 11 +; GFX7-NEXT: v_readlane_b32 s51, v21, 10 +; GFX7-NEXT: v_readlane_b32 s50, v21, 9 +; GFX7-NEXT: v_readlane_b32 s49, v21, 8 +; GFX7-NEXT: v_readlane_b32 s48, v21, 7 +; GFX7-NEXT: v_readlane_b32 s39, v21, 6 +; GFX7-NEXT: v_readlane_b32 s38, v21, 5 +; GFX7-NEXT: v_readlane_b32 s37, v21, 4 +; GFX7-NEXT: v_readlane_b32 s36, v21, 3 +; GFX7-NEXT: v_readlane_b32 s35, v21, 2 +; GFX7-NEXT: v_readlane_b32 s34, v21, 1 +; GFX7-NEXT: v_readlane_b32 s33, v21, 0 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: s_add_i32 s6, s32, 0x100400 ; GFX7-NEXT: buffer_load_dword v21, off, s[0:3], s6 ; 4-byte Folded Reload @@ -671,24 +671,24 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe ; GFX8-NEXT: s_add_i32 s6, s32, 0x100400 ; GFX8-NEXT: buffer_store_dword v21, off, s[0:3], s6 ; 4-byte Folded Spill ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_writelane_b32 v21, s30, 0 -; GFX8-NEXT: v_writelane_b32 v21, s31, 1 -; GFX8-NEXT: v_writelane_b32 v21, s33, 2 -; GFX8-NEXT: v_writelane_b32 v21, s34, 3 -; GFX8-NEXT: v_writelane_b32 v21, s35, 4 -; GFX8-NEXT: v_writelane_b32 v21, s36, 5 -; GFX8-NEXT: v_writelane_b32 v21, s37, 6 -; GFX8-NEXT: v_writelane_b32 v21, s38, 7 -; GFX8-NEXT: v_writelane_b32 v21, s39, 8 -; GFX8-NEXT: v_writelane_b32 v21, s48, 9 -; GFX8-NEXT: v_writelane_b32 v21, s49, 10 -; GFX8-NEXT: v_writelane_b32 v21, s50, 11 -; GFX8-NEXT: v_writelane_b32 v21, s51, 12 -; GFX8-NEXT: v_writelane_b32 v21, s52, 13 -; GFX8-NEXT: v_writelane_b32 v21, s53, 14 -; GFX8-NEXT: v_writelane_b32 v21, s54, 15 +; GFX8-NEXT: v_writelane_b32 v21, s33, 0 +; GFX8-NEXT: v_writelane_b32 v21, s34, 1 +; GFX8-NEXT: v_writelane_b32 v21, s35, 2 +; GFX8-NEXT: v_writelane_b32 v21, s36, 3 +; GFX8-NEXT: v_writelane_b32 v21, s37, 4 +; GFX8-NEXT: v_writelane_b32 v21, s38, 5 +; GFX8-NEXT: v_writelane_b32 v21, s39, 6 +; GFX8-NEXT: v_writelane_b32 v21, s48, 7 +; GFX8-NEXT: v_writelane_b32 v21, s49, 8 +; GFX8-NEXT: v_writelane_b32 v21, s50, 9 +; GFX8-NEXT: v_writelane_b32 v21, s51, 10 +; GFX8-NEXT: v_writelane_b32 v21, s52, 11 +; GFX8-NEXT: v_writelane_b32 v21, s53, 12 +; GFX8-NEXT: v_writelane_b32 v21, s54, 13 +; GFX8-NEXT: v_writelane_b32 v21, s55, 14 +; GFX8-NEXT: v_writelane_b32 v21, s30, 15 ; GFX8-NEXT: s_and_b64 s[4:5], 0, exec -; GFX8-NEXT: v_writelane_b32 v21, s55, 16 +; GFX8-NEXT: v_writelane_b32 v21, s31, 16 ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc ; GFX8-NEXT: ;;#ASMEND @@ -699,23 +699,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc ; GFX8-NEXT: ;;#ASMEND -; GFX8-NEXT: v_readlane_b32 s55, v21, 16 -; GFX8-NEXT: v_readlane_b32 s54, v21, 15 -; GFX8-NEXT: v_readlane_b32 s53, v21, 14 -; GFX8-NEXT: v_readlane_b32 s52, v21, 13 -; GFX8-NEXT: v_readlane_b32 s51, v21, 12 -; GFX8-NEXT: v_readlane_b32 s50, v21, 11 -; GFX8-NEXT: v_readlane_b32 s49, v21, 10 -; GFX8-NEXT: v_readlane_b32 s48, v21, 9 -; GFX8-NEXT: v_readlane_b32 s39, v21, 8 -; GFX8-NEXT: v_readlane_b32 s38, v21, 7 -; GFX8-NEXT: v_readlane_b32 s37, v21, 6 -; GFX8-NEXT: v_readlane_b32 s36, v21, 5 -; GFX8-NEXT: v_readlane_b32 s35, v21, 4 -; GFX8-NEXT: v_readlane_b32 s34, v21, 3 -; GFX8-NEXT: v_readlane_b32 s33, v21, 2 -; GFX8-NEXT: v_readlane_b32 s31, v21, 1 -; GFX8-NEXT: v_readlane_b32 s30, v21, 0 +; GFX8-NEXT: v_readlane_b32 s30, v21, 15 +; GFX8-NEXT: v_readlane_b32 s31, v21, 16 +; GFX8-NEXT: v_readlane_b32 s55, v21, 14 +; GFX8-NEXT: v_readlane_b32 s54, v21, 13 +; GFX8-NEXT: v_readlane_b32 s53, v21, 12 +; GFX8-NEXT: v_readlane_b32 s52, v21, 11 +; GFX8-NEXT: v_readlane_b32 s51, v21, 10 +; GFX8-NEXT: v_readlane_b32 s50, v21, 9 +; GFX8-NEXT: v_readlane_b32 s49, v21, 8 +; GFX8-NEXT: v_readlane_b32 s48, v21, 7 +; GFX8-NEXT: v_readlane_b32 s39, v21, 6 +; GFX8-NEXT: v_readlane_b32 s38, v21, 5 +; GFX8-NEXT: v_readlane_b32 s37, v21, 4 +; GFX8-NEXT: v_readlane_b32 s36, v21, 3 +; GFX8-NEXT: v_readlane_b32 s35, v21, 2 +; GFX8-NEXT: v_readlane_b32 s34, v21, 1 +; GFX8-NEXT: v_readlane_b32 s33, v21, 0 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: s_add_i32 s6, s32, 0x100400 ; GFX8-NEXT: buffer_load_dword v21, off, s[0:3], s6 ; 4-byte Folded Reload @@ -730,24 +730,24 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe ; GFX900-NEXT: s_add_i32 s6, s32, 0x100400 ; GFX900-NEXT: buffer_store_dword v21, off, s[0:3], s6 ; 4-byte Folded Spill ; GFX900-NEXT: s_mov_b64 exec, s[4:5] -; GFX900-NEXT: v_writelane_b32 v21, s30, 0 -; GFX900-NEXT: v_writelane_b32 v21, s31, 1 -; GFX900-NEXT: v_writelane_b32 v21, s33, 2 -; GFX900-NEXT: v_writelane_b32 v21, s34, 3 -; GFX900-NEXT: v_writelane_b32 v21, s35, 4 -; GFX900-NEXT: v_writelane_b32 v21, s36, 5 -; GFX900-NEXT: v_writelane_b32 v21, s37, 6 -; GFX900-NEXT: v_writelane_b32 v21, s38, 7 -; GFX900-NEXT: v_writelane_b32 v21, s39, 8 -; GFX900-NEXT: v_writelane_b32 v21, s48, 9 -; GFX900-NEXT: v_writelane_b32 v21, s49, 10 -; GFX900-NEXT: v_writelane_b32 v21, s50, 11 -; GFX900-NEXT: v_writelane_b32 v21, s51, 12 -; GFX900-NEXT: v_writelane_b32 v21, s52, 13 -; GFX900-NEXT: v_writelane_b32 v21, s53, 14 -; GFX900-NEXT: v_writelane_b32 v21, s54, 15 +; GFX900-NEXT: v_writelane_b32 v21, s33, 0 +; GFX900-NEXT: v_writelane_b32 v21, s34, 1 +; GFX900-NEXT: v_writelane_b32 v21, s35, 2 +; GFX900-NEXT: v_writelane_b32 v21, s36, 3 +; GFX900-NEXT: v_writelane_b32 v21, s37, 4 +; GFX900-NEXT: v_writelane_b32 v21, s38, 5 +; GFX900-NEXT: v_writelane_b32 v21, s39, 6 +; GFX900-NEXT: v_writelane_b32 v21, s48, 7 +; GFX900-NEXT: v_writelane_b32 v21, s49, 8 +; GFX900-NEXT: v_writelane_b32 v21, s50, 9 +; GFX900-NEXT: v_writelane_b32 v21, s51, 10 +; GFX900-NEXT: v_writelane_b32 v21, s52, 11 +; GFX900-NEXT: v_writelane_b32 v21, s53, 12 +; GFX900-NEXT: v_writelane_b32 v21, s54, 13 +; GFX900-NEXT: v_writelane_b32 v21, s55, 14 +; GFX900-NEXT: v_writelane_b32 v21, s30, 15 ; GFX900-NEXT: s_and_b64 s[4:5], 0, exec -; GFX900-NEXT: v_writelane_b32 v21, s55, 16 +; GFX900-NEXT: v_writelane_b32 v21, s31, 16 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc ; GFX900-NEXT: ;;#ASMEND @@ -758,23 +758,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s55, v21, 16 -; GFX900-NEXT: v_readlane_b32 s54, v21, 15 -; GFX900-NEXT: v_readlane_b32 s53, v21, 14 -; GFX900-NEXT: v_readlane_b32 s52, v21, 13 -; GFX900-NEXT: v_readlane_b32 s51, v21, 12 -; GFX900-NEXT: v_readlane_b32 s50, v21, 11 -; GFX900-NEXT: v_readlane_b32 s49, v21, 10 -; GFX900-NEXT: v_readlane_b32 s48, v21, 9 -; GFX900-NEXT: v_readlane_b32 s39, v21, 8 -; GFX900-NEXT: v_readlane_b32 s38, v21, 7 -; GFX900-NEXT: v_readlane_b32 s37, v21, 6 -; GFX900-NEXT: v_readlane_b32 s36, v21, 5 -; GFX900-NEXT: v_readlane_b32 s35, v21, 4 -; GFX900-NEXT: v_readlane_b32 s34, v21, 3 -; GFX900-NEXT: v_readlane_b32 s33, v21, 2 -; GFX900-NEXT: v_readlane_b32 s31, v21, 1 -; GFX900-NEXT: v_readlane_b32 s30, v21, 0 +; GFX900-NEXT: v_readlane_b32 s30, v21, 15 +; GFX900-NEXT: v_readlane_b32 s31, v21, 16 +; GFX900-NEXT: v_readlane_b32 s55, v21, 14 +; GFX900-NEXT: v_readlane_b32 s54, v21, 13 +; GFX900-NEXT: v_readlane_b32 s53, v21, 12 +; GFX900-NEXT: v_readlane_b32 s52, v21, 11 +; GFX900-NEXT: v_readlane_b32 s51, v21, 10 +; GFX900-NEXT: v_readlane_b32 s50, v21, 9 +; GFX900-NEXT: v_readlane_b32 s49, v21, 8 +; GFX900-NEXT: v_readlane_b32 s48, v21, 7 +; GFX900-NEXT: v_readlane_b32 s39, v21, 6 +; GFX900-NEXT: v_readlane_b32 s38, v21, 5 +; GFX900-NEXT: v_readlane_b32 s37, v21, 4 +; GFX900-NEXT: v_readlane_b32 s36, v21, 3 +; GFX900-NEXT: v_readlane_b32 s35, v21, 2 +; GFX900-NEXT: v_readlane_b32 s34, v21, 1 +; GFX900-NEXT: v_readlane_b32 s33, v21, 0 ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: s_add_i32 s6, s32, 0x100400 ; GFX900-NEXT: buffer_load_dword v21, off, s[0:3], s6 ; 4-byte Folded Reload @@ -789,24 +789,25 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe ; GFX942-NEXT: s_add_i32 s2, s32, 0x4010 ; GFX942-NEXT: scratch_store_dword off, v21, s2 ; 4-byte Folded Spill ; GFX942-NEXT: s_mov_b64 exec, s[0:1] -; GFX942-NEXT: v_writelane_b32 v21, s30, 0 -; GFX942-NEXT: v_writelane_b32 v21, s31, 1 -; GFX942-NEXT: v_writelane_b32 v21, s33, 2 -; GFX942-NEXT: v_writelane_b32 v21, s34, 3 -; GFX942-NEXT: v_writelane_b32 v21, s35, 4 -; GFX942-NEXT: v_writelane_b32 v21, s36, 5 -; GFX942-NEXT: v_writelane_b32 v21, s37, 6 -; GFX942-NEXT: v_writelane_b32 v21, s38, 7 -; GFX942-NEXT: v_writelane_b32 v21, s39, 8 -; GFX942-NEXT: v_writelane_b32 v21, s48, 9 -; GFX942-NEXT: v_writelane_b32 v21, s49, 10 -; GFX942-NEXT: v_writelane_b32 v21, s50, 11 -; GFX942-NEXT: v_writelane_b32 v21, s51, 12 -; GFX942-NEXT: v_writelane_b32 v21, s52, 13 -; GFX942-NEXT: v_writelane_b32 v21, s53, 14 -; GFX942-NEXT: v_writelane_b32 v21, s54, 15 +; GFX942-NEXT: v_writelane_b32 v21, s33, 0 +; GFX942-NEXT: v_writelane_b32 v21, s34, 1 +; GFX942-NEXT: v_writelane_b32 v21, s35, 2 +; GFX942-NEXT: v_writelane_b32 v21, s36, 3 +; GFX942-NEXT: v_writelane_b32 v21, s37, 4 +; GFX942-NEXT: v_writelane_b32 v21, s38, 5 +; GFX942-NEXT: v_writelane_b32 v21, s39, 6 +; GFX942-NEXT: v_writelane_b32 v21, s48, 7 +; GFX942-NEXT: v_writelane_b32 v21, s49, 8 +; GFX942-NEXT: v_writelane_b32 v21, s50, 9 +; GFX942-NEXT: v_writelane_b32 v21, s51, 10 +; GFX942-NEXT: v_writelane_b32 v21, s52, 11 +; GFX942-NEXT: v_writelane_b32 v21, s53, 12 +; GFX942-NEXT: v_writelane_b32 v21, s54, 13 +; GFX942-NEXT: v_writelane_b32 v21, s55, 14 +; GFX942-NEXT: v_writelane_b32 v21, s30, 15 ; GFX942-NEXT: s_and_b64 s[60:61], 0, exec -; GFX942-NEXT: v_writelane_b32 v21, s55, 16 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_writelane_b32 v21, s31, 16 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc ; GFX942-NEXT: ;;#ASMEND @@ -818,23 +819,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_readlane_b32 s55, v21, 16 -; GFX942-NEXT: v_readlane_b32 s54, v21, 15 -; GFX942-NEXT: v_readlane_b32 s53, v21, 14 -; GFX942-NEXT: v_readlane_b32 s52, v21, 13 -; GFX942-NEXT: v_readlane_b32 s51, v21, 12 -; GFX942-NEXT: v_readlane_b32 s50, v21, 11 -; GFX942-NEXT: v_readlane_b32 s49, v21, 10 -; GFX942-NEXT: v_readlane_b32 s48, v21, 9 -; GFX942-NEXT: v_readlane_b32 s39, v21, 8 -; GFX942-NEXT: v_readlane_b32 s38, v21, 7 -; GFX942-NEXT: v_readlane_b32 s37, v21, 6 -; GFX942-NEXT: v_readlane_b32 s36, v21, 5 -; GFX942-NEXT: v_readlane_b32 s35, v21, 4 -; GFX942-NEXT: v_readlane_b32 s34, v21, 3 -; GFX942-NEXT: v_readlane_b32 s33, v21, 2 -; GFX942-NEXT: v_readlane_b32 s31, v21, 1 -; GFX942-NEXT: v_readlane_b32 s30, v21, 0 +; GFX942-NEXT: v_readlane_b32 s30, v21, 15 +; GFX942-NEXT: v_readlane_b32 s31, v21, 16 +; GFX942-NEXT: v_readlane_b32 s55, v21, 14 +; GFX942-NEXT: v_readlane_b32 s54, v21, 13 +; GFX942-NEXT: v_readlane_b32 s53, v21, 12 +; GFX942-NEXT: v_readlane_b32 s52, v21, 11 +; GFX942-NEXT: v_readlane_b32 s51, v21, 10 +; GFX942-NEXT: v_readlane_b32 s50, v21, 9 +; GFX942-NEXT: v_readlane_b32 s49, v21, 8 +; GFX942-NEXT: v_readlane_b32 s48, v21, 7 +; GFX942-NEXT: v_readlane_b32 s39, v21, 6 +; GFX942-NEXT: v_readlane_b32 s38, v21, 5 +; GFX942-NEXT: v_readlane_b32 s37, v21, 4 +; GFX942-NEXT: v_readlane_b32 s36, v21, 3 +; GFX942-NEXT: v_readlane_b32 s35, v21, 2 +; GFX942-NEXT: v_readlane_b32 s34, v21, 1 +; GFX942-NEXT: v_readlane_b32 s33, v21, 0 ; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX942-NEXT: s_add_i32 s2, s32, 0x4010 ; GFX942-NEXT: scratch_load_dword v21, off, s2 ; 4-byte Folded Reload @@ -850,24 +851,24 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe ; GFX10_1-NEXT: buffer_store_dword v21, off, s[0:3], s5 ; 4-byte Folded Spill ; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_1-NEXT: v_writelane_b32 v21, s30, 0 +; GFX10_1-NEXT: v_writelane_b32 v21, s33, 0 ; GFX10_1-NEXT: s_and_b32 s59, 0, exec_lo -; GFX10_1-NEXT: v_writelane_b32 v21, s31, 1 -; GFX10_1-NEXT: v_writelane_b32 v21, s33, 2 -; GFX10_1-NEXT: v_writelane_b32 v21, s34, 3 -; GFX10_1-NEXT: v_writelane_b32 v21, s35, 4 -; GFX10_1-NEXT: v_writelane_b32 v21, s36, 5 -; GFX10_1-NEXT: v_writelane_b32 v21, s37, 6 -; GFX10_1-NEXT: v_writelane_b32 v21, s38, 7 -; GFX10_1-NEXT: v_writelane_b32 v21, s39, 8 -; GFX10_1-NEXT: v_writelane_b32 v21, s48, 9 -; GFX10_1-NEXT: v_writelane_b32 v21, s49, 10 -; GFX10_1-NEXT: v_writelane_b32 v21, s50, 11 -; GFX10_1-NEXT: v_writelane_b32 v21, s51, 12 -; GFX10_1-NEXT: v_writelane_b32 v21, s52, 13 -; GFX10_1-NEXT: v_writelane_b32 v21, s53, 14 -; GFX10_1-NEXT: v_writelane_b32 v21, s54, 15 -; GFX10_1-NEXT: v_writelane_b32 v21, s55, 16 +; GFX10_1-NEXT: v_writelane_b32 v21, s34, 1 +; GFX10_1-NEXT: v_writelane_b32 v21, s35, 2 +; GFX10_1-NEXT: v_writelane_b32 v21, s36, 3 +; GFX10_1-NEXT: v_writelane_b32 v21, s37, 4 +; GFX10_1-NEXT: v_writelane_b32 v21, s38, 5 +; GFX10_1-NEXT: v_writelane_b32 v21, s39, 6 +; GFX10_1-NEXT: v_writelane_b32 v21, s48, 7 +; GFX10_1-NEXT: v_writelane_b32 v21, s49, 8 +; GFX10_1-NEXT: v_writelane_b32 v21, s50, 9 +; GFX10_1-NEXT: v_writelane_b32 v21, s51, 10 +; GFX10_1-NEXT: v_writelane_b32 v21, s52, 11 +; GFX10_1-NEXT: v_writelane_b32 v21, s53, 12 +; GFX10_1-NEXT: v_writelane_b32 v21, s54, 13 +; GFX10_1-NEXT: v_writelane_b32 v21, s55, 14 +; GFX10_1-NEXT: v_writelane_b32 v21, s30, 15 +; GFX10_1-NEXT: v_writelane_b32 v21, s31, 16 ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc ; GFX10_1-NEXT: ;;#ASMEND @@ -878,23 +879,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc ; GFX10_1-NEXT: ;;#ASMEND -; GFX10_1-NEXT: v_readlane_b32 s55, v21, 16 -; GFX10_1-NEXT: v_readlane_b32 s54, v21, 15 -; GFX10_1-NEXT: v_readlane_b32 s53, v21, 14 -; GFX10_1-NEXT: v_readlane_b32 s52, v21, 13 -; GFX10_1-NEXT: v_readlane_b32 s51, v21, 12 -; GFX10_1-NEXT: v_readlane_b32 s50, v21, 11 -; GFX10_1-NEXT: v_readlane_b32 s49, v21, 10 -; GFX10_1-NEXT: v_readlane_b32 s48, v21, 9 -; GFX10_1-NEXT: v_readlane_b32 s39, v21, 8 -; GFX10_1-NEXT: v_readlane_b32 s38, v21, 7 -; GFX10_1-NEXT: v_readlane_b32 s37, v21, 6 -; GFX10_1-NEXT: v_readlane_b32 s36, v21, 5 -; GFX10_1-NEXT: v_readlane_b32 s35, v21, 4 -; GFX10_1-NEXT: v_readlane_b32 s34, v21, 3 -; GFX10_1-NEXT: v_readlane_b32 s33, v21, 2 -; GFX10_1-NEXT: v_readlane_b32 s31, v21, 1 -; GFX10_1-NEXT: v_readlane_b32 s30, v21, 0 +; GFX10_1-NEXT: v_readlane_b32 s30, v21, 15 +; GFX10_1-NEXT: v_readlane_b32 s31, v21, 16 +; GFX10_1-NEXT: v_readlane_b32 s55, v21, 14 +; GFX10_1-NEXT: v_readlane_b32 s54, v21, 13 +; GFX10_1-NEXT: v_readlane_b32 s53, v21, 12 +; GFX10_1-NEXT: v_readlane_b32 s52, v21, 11 +; GFX10_1-NEXT: v_readlane_b32 s51, v21, 10 +; GFX10_1-NEXT: v_readlane_b32 s50, v21, 9 +; GFX10_1-NEXT: v_readlane_b32 s49, v21, 8 +; GFX10_1-NEXT: v_readlane_b32 s48, v21, 7 +; GFX10_1-NEXT: v_readlane_b32 s39, v21, 6 +; GFX10_1-NEXT: v_readlane_b32 s38, v21, 5 +; GFX10_1-NEXT: v_readlane_b32 s37, v21, 4 +; GFX10_1-NEXT: v_readlane_b32 s36, v21, 3 +; GFX10_1-NEXT: v_readlane_b32 s35, v21, 2 +; GFX10_1-NEXT: v_readlane_b32 s34, v21, 1 +; GFX10_1-NEXT: v_readlane_b32 s33, v21, 0 ; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10_1-NEXT: s_add_i32 s5, s32, 0x80200 ; GFX10_1-NEXT: buffer_load_dword v21, off, s[0:3], s5 ; 4-byte Folded Reload @@ -910,24 +911,24 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe ; GFX10_3-NEXT: s_add_i32 s5, s32, 0x80200 ; GFX10_3-NEXT: buffer_store_dword v21, off, s[0:3], s5 ; 4-byte Folded Spill ; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_3-NEXT: v_writelane_b32 v21, s30, 0 +; GFX10_3-NEXT: v_writelane_b32 v21, s33, 0 ; GFX10_3-NEXT: s_and_b32 s59, 0, exec_lo -; GFX10_3-NEXT: v_writelane_b32 v21, s31, 1 -; GFX10_3-NEXT: v_writelane_b32 v21, s33, 2 -; GFX10_3-NEXT: v_writelane_b32 v21, s34, 3 -; GFX10_3-NEXT: v_writelane_b32 v21, s35, 4 -; GFX10_3-NEXT: v_writelane_b32 v21, s36, 5 -; GFX10_3-NEXT: v_writelane_b32 v21, s37, 6 -; GFX10_3-NEXT: v_writelane_b32 v21, s38, 7 -; GFX10_3-NEXT: v_writelane_b32 v21, s39, 8 -; GFX10_3-NEXT: v_writelane_b32 v21, s48, 9 -; GFX10_3-NEXT: v_writelane_b32 v21, s49, 10 -; GFX10_3-NEXT: v_writelane_b32 v21, s50, 11 -; GFX10_3-NEXT: v_writelane_b32 v21, s51, 12 -; GFX10_3-NEXT: v_writelane_b32 v21, s52, 13 -; GFX10_3-NEXT: v_writelane_b32 v21, s53, 14 -; GFX10_3-NEXT: v_writelane_b32 v21, s54, 15 -; GFX10_3-NEXT: v_writelane_b32 v21, s55, 16 +; GFX10_3-NEXT: v_writelane_b32 v21, s34, 1 +; GFX10_3-NEXT: v_writelane_b32 v21, s35, 2 +; GFX10_3-NEXT: v_writelane_b32 v21, s36, 3 +; GFX10_3-NEXT: v_writelane_b32 v21, s37, 4 +; GFX10_3-NEXT: v_writelane_b32 v21, s38, 5 +; GFX10_3-NEXT: v_writelane_b32 v21, s39, 6 +; GFX10_3-NEXT: v_writelane_b32 v21, s48, 7 +; GFX10_3-NEXT: v_writelane_b32 v21, s49, 8 +; GFX10_3-NEXT: v_writelane_b32 v21, s50, 9 +; GFX10_3-NEXT: v_writelane_b32 v21, s51, 10 +; GFX10_3-NEXT: v_writelane_b32 v21, s52, 11 +; GFX10_3-NEXT: v_writelane_b32 v21, s53, 12 +; GFX10_3-NEXT: v_writelane_b32 v21, s54, 13 +; GFX10_3-NEXT: v_writelane_b32 v21, s55, 14 +; GFX10_3-NEXT: v_writelane_b32 v21, s30, 15 +; GFX10_3-NEXT: v_writelane_b32 v21, s31, 16 ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc ; GFX10_3-NEXT: ;;#ASMEND @@ -938,23 +939,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc ; GFX10_3-NEXT: ;;#ASMEND -; GFX10_3-NEXT: v_readlane_b32 s55, v21, 16 -; GFX10_3-NEXT: v_readlane_b32 s54, v21, 15 -; GFX10_3-NEXT: v_readlane_b32 s53, v21, 14 -; GFX10_3-NEXT: v_readlane_b32 s52, v21, 13 -; GFX10_3-NEXT: v_readlane_b32 s51, v21, 12 -; GFX10_3-NEXT: v_readlane_b32 s50, v21, 11 -; GFX10_3-NEXT: v_readlane_b32 s49, v21, 10 -; GFX10_3-NEXT: v_readlane_b32 s48, v21, 9 -; GFX10_3-NEXT: v_readlane_b32 s39, v21, 8 -; GFX10_3-NEXT: v_readlane_b32 s38, v21, 7 -; GFX10_3-NEXT: v_readlane_b32 s37, v21, 6 -; GFX10_3-NEXT: v_readlane_b32 s36, v21, 5 -; GFX10_3-NEXT: v_readlane_b32 s35, v21, 4 -; GFX10_3-NEXT: v_readlane_b32 s34, v21, 3 -; GFX10_3-NEXT: v_readlane_b32 s33, v21, 2 -; GFX10_3-NEXT: v_readlane_b32 s31, v21, 1 -; GFX10_3-NEXT: v_readlane_b32 s30, v21, 0 +; GFX10_3-NEXT: v_readlane_b32 s30, v21, 15 +; GFX10_3-NEXT: v_readlane_b32 s31, v21, 16 +; GFX10_3-NEXT: v_readlane_b32 s55, v21, 14 +; GFX10_3-NEXT: v_readlane_b32 s54, v21, 13 +; GFX10_3-NEXT: v_readlane_b32 s53, v21, 12 +; GFX10_3-NEXT: v_readlane_b32 s52, v21, 11 +; GFX10_3-NEXT: v_readlane_b32 s51, v21, 10 +; GFX10_3-NEXT: v_readlane_b32 s50, v21, 9 +; GFX10_3-NEXT: v_readlane_b32 s49, v21, 8 +; GFX10_3-NEXT: v_readlane_b32 s48, v21, 7 +; GFX10_3-NEXT: v_readlane_b32 s39, v21, 6 +; GFX10_3-NEXT: v_readlane_b32 s38, v21, 5 +; GFX10_3-NEXT: v_readlane_b32 s37, v21, 4 +; GFX10_3-NEXT: v_readlane_b32 s36, v21, 3 +; GFX10_3-NEXT: v_readlane_b32 s35, v21, 2 +; GFX10_3-NEXT: v_readlane_b32 s34, v21, 1 +; GFX10_3-NEXT: v_readlane_b32 s33, v21, 0 ; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10_3-NEXT: s_add_i32 s5, s32, 0x80200 ; GFX10_3-NEXT: buffer_load_dword v21, off, s[0:3], s5 ; 4-byte Folded Reload @@ -969,24 +970,24 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe ; GFX11-NEXT: s_add_i32 s1, s32, 0x4010 ; GFX11-NEXT: scratch_store_b32 off, v21, s1 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v21, s30, 0 +; GFX11-NEXT: v_writelane_b32 v21, s33, 0 ; GFX11-NEXT: s_and_b32 s59, 0, exec_lo -; GFX11-NEXT: v_writelane_b32 v21, s31, 1 -; GFX11-NEXT: v_writelane_b32 v21, s33, 2 -; GFX11-NEXT: v_writelane_b32 v21, s34, 3 -; GFX11-NEXT: v_writelane_b32 v21, s35, 4 -; GFX11-NEXT: v_writelane_b32 v21, s36, 5 -; GFX11-NEXT: v_writelane_b32 v21, s37, 6 -; GFX11-NEXT: v_writelane_b32 v21, s38, 7 -; GFX11-NEXT: v_writelane_b32 v21, s39, 8 -; GFX11-NEXT: v_writelane_b32 v21, s48, 9 -; GFX11-NEXT: v_writelane_b32 v21, s49, 10 -; GFX11-NEXT: v_writelane_b32 v21, s50, 11 -; GFX11-NEXT: v_writelane_b32 v21, s51, 12 -; GFX11-NEXT: v_writelane_b32 v21, s52, 13 -; GFX11-NEXT: v_writelane_b32 v21, s53, 14 -; GFX11-NEXT: v_writelane_b32 v21, s54, 15 -; GFX11-NEXT: v_writelane_b32 v21, s55, 16 +; GFX11-NEXT: v_writelane_b32 v21, s34, 1 +; GFX11-NEXT: v_writelane_b32 v21, s35, 2 +; GFX11-NEXT: v_writelane_b32 v21, s36, 3 +; GFX11-NEXT: v_writelane_b32 v21, s37, 4 +; GFX11-NEXT: v_writelane_b32 v21, s38, 5 +; GFX11-NEXT: v_writelane_b32 v21, s39, 6 +; GFX11-NEXT: v_writelane_b32 v21, s48, 7 +; GFX11-NEXT: v_writelane_b32 v21, s49, 8 +; GFX11-NEXT: v_writelane_b32 v21, s50, 9 +; GFX11-NEXT: v_writelane_b32 v21, s51, 10 +; GFX11-NEXT: v_writelane_b32 v21, s52, 11 +; GFX11-NEXT: v_writelane_b32 v21, s53, 12 +; GFX11-NEXT: v_writelane_b32 v21, s54, 13 +; GFX11-NEXT: v_writelane_b32 v21, s55, 14 +; GFX11-NEXT: v_writelane_b32 v21, s30, 15 +; GFX11-NEXT: v_writelane_b32 v21, s31, 16 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc ; GFX11-NEXT: ;;#ASMEND @@ -999,23 +1000,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_readlane_b32 s55, v21, 16 -; GFX11-NEXT: v_readlane_b32 s54, v21, 15 -; GFX11-NEXT: v_readlane_b32 s53, v21, 14 -; GFX11-NEXT: v_readlane_b32 s52, v21, 13 -; GFX11-NEXT: v_readlane_b32 s51, v21, 12 -; GFX11-NEXT: v_readlane_b32 s50, v21, 11 -; GFX11-NEXT: v_readlane_b32 s49, v21, 10 -; GFX11-NEXT: v_readlane_b32 s48, v21, 9 -; GFX11-NEXT: v_readlane_b32 s39, v21, 8 -; GFX11-NEXT: v_readlane_b32 s38, v21, 7 -; GFX11-NEXT: v_readlane_b32 s37, v21, 6 -; GFX11-NEXT: v_readlane_b32 s36, v21, 5 -; GFX11-NEXT: v_readlane_b32 s35, v21, 4 -; GFX11-NEXT: v_readlane_b32 s34, v21, 3 -; GFX11-NEXT: v_readlane_b32 s33, v21, 2 -; GFX11-NEXT: v_readlane_b32 s31, v21, 1 -; GFX11-NEXT: v_readlane_b32 s30, v21, 0 +; GFX11-NEXT: v_readlane_b32 s30, v21, 15 +; GFX11-NEXT: v_readlane_b32 s31, v21, 16 +; GFX11-NEXT: v_readlane_b32 s55, v21, 14 +; GFX11-NEXT: v_readlane_b32 s54, v21, 13 +; GFX11-NEXT: v_readlane_b32 s53, v21, 12 +; GFX11-NEXT: v_readlane_b32 s52, v21, 11 +; GFX11-NEXT: v_readlane_b32 s51, v21, 10 +; GFX11-NEXT: v_readlane_b32 s50, v21, 9 +; GFX11-NEXT: v_readlane_b32 s49, v21, 8 +; GFX11-NEXT: v_readlane_b32 s48, v21, 7 +; GFX11-NEXT: v_readlane_b32 s39, v21, 6 +; GFX11-NEXT: v_readlane_b32 s38, v21, 5 +; GFX11-NEXT: v_readlane_b32 s37, v21, 4 +; GFX11-NEXT: v_readlane_b32 s36, v21, 3 +; GFX11-NEXT: v_readlane_b32 s35, v21, 2 +; GFX11-NEXT: v_readlane_b32 s34, v21, 1 +; GFX11-NEXT: v_readlane_b32 s33, v21, 0 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: s_add_i32 s1, s32, 0x4010 ; GFX11-NEXT: scratch_load_b32 v21, off, s1 ; 4-byte Folded Reload @@ -1034,24 +1035,24 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe ; GFX12-NEXT: scratch_store_b32 off, v21, s32 offset:16384 ; 4-byte Folded Spill ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 -; GFX12-NEXT: v_writelane_b32 v21, s30, 0 +; GFX12-NEXT: v_writelane_b32 v21, s33, 0 ; GFX12-NEXT: s_and_b32 s59, 0, exec_lo -; GFX12-NEXT: v_writelane_b32 v21, s31, 1 -; GFX12-NEXT: v_writelane_b32 v21, s33, 2 -; GFX12-NEXT: v_writelane_b32 v21, s34, 3 -; GFX12-NEXT: v_writelane_b32 v21, s35, 4 -; GFX12-NEXT: v_writelane_b32 v21, s36, 5 -; GFX12-NEXT: v_writelane_b32 v21, s37, 6 -; GFX12-NEXT: v_writelane_b32 v21, s38, 7 -; GFX12-NEXT: v_writelane_b32 v21, s39, 8 -; GFX12-NEXT: v_writelane_b32 v21, s48, 9 -; GFX12-NEXT: v_writelane_b32 v21, s49, 10 -; GFX12-NEXT: v_writelane_b32 v21, s50, 11 -; GFX12-NEXT: v_writelane_b32 v21, s51, 12 -; GFX12-NEXT: v_writelane_b32 v21, s52, 13 -; GFX12-NEXT: v_writelane_b32 v21, s53, 14 -; GFX12-NEXT: v_writelane_b32 v21, s54, 15 -; GFX12-NEXT: v_writelane_b32 v21, s55, 16 +; GFX12-NEXT: v_writelane_b32 v21, s34, 1 +; GFX12-NEXT: v_writelane_b32 v21, s35, 2 +; GFX12-NEXT: v_writelane_b32 v21, s36, 3 +; GFX12-NEXT: v_writelane_b32 v21, s37, 4 +; GFX12-NEXT: v_writelane_b32 v21, s38, 5 +; GFX12-NEXT: v_writelane_b32 v21, s39, 6 +; GFX12-NEXT: v_writelane_b32 v21, s48, 7 +; GFX12-NEXT: v_writelane_b32 v21, s49, 8 +; GFX12-NEXT: v_writelane_b32 v21, s50, 9 +; GFX12-NEXT: v_writelane_b32 v21, s51, 10 +; GFX12-NEXT: v_writelane_b32 v21, s52, 11 +; GFX12-NEXT: v_writelane_b32 v21, s53, 12 +; GFX12-NEXT: v_writelane_b32 v21, s54, 13 +; GFX12-NEXT: v_writelane_b32 v21, s55, 14 +; GFX12-NEXT: v_writelane_b32 v21, s30, 15 +; GFX12-NEXT: v_writelane_b32 v21, s31, 16 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc ; GFX12-NEXT: ;;#ASMEND @@ -1061,23 +1062,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe ; GFX12-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc, s54, scc ; GFX12-NEXT: ;;#ASMEND ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_readlane_b32 s55, v21, 16 -; GFX12-NEXT: v_readlane_b32 s54, v21, 15 -; GFX12-NEXT: v_readlane_b32 s53, v21, 14 -; GFX12-NEXT: v_readlane_b32 s52, v21, 13 -; GFX12-NEXT: v_readlane_b32 s51, v21, 12 -; GFX12-NEXT: v_readlane_b32 s50, v21, 11 -; GFX12-NEXT: v_readlane_b32 s49, v21, 10 -; GFX12-NEXT: v_readlane_b32 s48, v21, 9 -; GFX12-NEXT: v_readlane_b32 s39, v21, 8 -; GFX12-NEXT: v_readlane_b32 s38, v21, 7 -; GFX12-NEXT: v_readlane_b32 s37, v21, 6 -; GFX12-NEXT: v_readlane_b32 s36, v21, 5 -; GFX12-NEXT: v_readlane_b32 s35, v21, 4 -; GFX12-NEXT: v_readlane_b32 s34, v21, 3 -; GFX12-NEXT: v_readlane_b32 s33, v21, 2 -; GFX12-NEXT: v_readlane_b32 s31, v21, 1 -; GFX12-NEXT: v_readlane_b32 s30, v21, 0 +; GFX12-NEXT: v_readlane_b32 s30, v21, 15 +; GFX12-NEXT: v_readlane_b32 s31, v21, 16 +; GFX12-NEXT: v_readlane_b32 s55, v21, 14 +; GFX12-NEXT: v_readlane_b32 s54, v21, 13 +; GFX12-NEXT: v_readlane_b32 s53, v21, 12 +; GFX12-NEXT: v_readlane_b32 s52, v21, 11 +; GFX12-NEXT: v_readlane_b32 s51, v21, 10 +; GFX12-NEXT: v_readlane_b32 s50, v21, 9 +; GFX12-NEXT: v_readlane_b32 s49, v21, 8 +; GFX12-NEXT: v_readlane_b32 s48, v21, 7 +; GFX12-NEXT: v_readlane_b32 s39, v21, 6 +; GFX12-NEXT: v_readlane_b32 s38, v21, 5 +; GFX12-NEXT: v_readlane_b32 s37, v21, 4 +; GFX12-NEXT: v_readlane_b32 s36, v21, 3 +; GFX12-NEXT: v_readlane_b32 s35, v21, 2 +; GFX12-NEXT: v_readlane_b32 s34, v21, 1 +; GFX12-NEXT: v_readlane_b32 s33, v21, 0 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_load_b32 v21, off, s32 offset:16384 ; 4-byte Folded Reload ; GFX12-NEXT: s_wait_alu 0xfffe @@ -1135,30 +1136,30 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: v_writelane_b32 v23, s28, 17 ; GFX7-NEXT: v_writelane_b32 v23, s29, 18 -; GFX7-NEXT: v_writelane_b32 v23, s30, 0 -; GFX7-NEXT: v_writelane_b32 v23, s31, 1 -; GFX7-NEXT: v_writelane_b32 v23, s33, 2 -; GFX7-NEXT: v_writelane_b32 v23, s34, 3 -; GFX7-NEXT: v_writelane_b32 v23, s35, 4 -; GFX7-NEXT: v_writelane_b32 v23, s36, 5 -; GFX7-NEXT: v_writelane_b32 v23, s37, 6 -; GFX7-NEXT: v_writelane_b32 v23, s38, 7 -; GFX7-NEXT: v_writelane_b32 v23, s39, 8 -; GFX7-NEXT: v_writelane_b32 v23, s48, 9 -; GFX7-NEXT: v_writelane_b32 v23, s49, 10 -; GFX7-NEXT: v_writelane_b32 v23, s50, 11 -; GFX7-NEXT: v_writelane_b32 v23, s51, 12 -; GFX7-NEXT: v_writelane_b32 v23, s52, 13 +; GFX7-NEXT: v_writelane_b32 v23, s33, 0 +; GFX7-NEXT: v_writelane_b32 v23, s34, 1 +; GFX7-NEXT: v_writelane_b32 v23, s35, 2 +; GFX7-NEXT: v_writelane_b32 v23, s36, 3 +; GFX7-NEXT: v_writelane_b32 v23, s37, 4 +; GFX7-NEXT: v_writelane_b32 v23, s38, 5 +; GFX7-NEXT: v_writelane_b32 v23, s39, 6 +; GFX7-NEXT: v_writelane_b32 v23, s48, 7 +; GFX7-NEXT: v_writelane_b32 v23, s49, 8 +; GFX7-NEXT: v_writelane_b32 v23, s50, 9 +; GFX7-NEXT: v_writelane_b32 v23, s51, 10 +; GFX7-NEXT: v_writelane_b32 v23, s52, 11 +; GFX7-NEXT: v_writelane_b32 v23, s53, 12 +; GFX7-NEXT: v_writelane_b32 v23, s54, 13 ; GFX7-NEXT: s_lshr_b32 s5, s32, 6 -; GFX7-NEXT: v_writelane_b32 v23, s53, 14 +; GFX7-NEXT: v_writelane_b32 v23, s55, 14 ; GFX7-NEXT: v_lshr_b32_e64 v0, s32, 6 ; GFX7-NEXT: s_add_i32 s4, s5, 0x4240 ; GFX7-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane -; GFX7-NEXT: v_writelane_b32 v23, s54, 15 +; GFX7-NEXT: v_writelane_b32 v23, s30, 15 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, 64, v0 ; GFX7-NEXT: v_writelane_b32 v22, s4, 0 ; GFX7-NEXT: s_and_b64 s[4:5], 0, exec -; GFX7-NEXT: v_writelane_b32 v23, s55, 16 +; GFX7-NEXT: v_writelane_b32 v23, s31, 16 ; GFX7-NEXT: ;;#ASMSTART ; GFX7-NEXT: ; use alloca0 v0 ; GFX7-NEXT: ;;#ASMEND @@ -1169,23 +1170,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX7-NEXT: ;;#ASMSTART ; GFX7-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc ; GFX7-NEXT: ;;#ASMEND -; GFX7-NEXT: v_readlane_b32 s55, v23, 16 -; GFX7-NEXT: v_readlane_b32 s54, v23, 15 -; GFX7-NEXT: v_readlane_b32 s53, v23, 14 -; GFX7-NEXT: v_readlane_b32 s52, v23, 13 -; GFX7-NEXT: v_readlane_b32 s51, v23, 12 -; GFX7-NEXT: v_readlane_b32 s50, v23, 11 -; GFX7-NEXT: v_readlane_b32 s49, v23, 10 -; GFX7-NEXT: v_readlane_b32 s48, v23, 9 -; GFX7-NEXT: v_readlane_b32 s39, v23, 8 -; GFX7-NEXT: v_readlane_b32 s38, v23, 7 -; GFX7-NEXT: v_readlane_b32 s37, v23, 6 -; GFX7-NEXT: v_readlane_b32 s36, v23, 5 -; GFX7-NEXT: v_readlane_b32 s35, v23, 4 -; GFX7-NEXT: v_readlane_b32 s34, v23, 3 -; GFX7-NEXT: v_readlane_b32 s33, v23, 2 -; GFX7-NEXT: v_readlane_b32 s31, v23, 1 -; GFX7-NEXT: v_readlane_b32 s30, v23, 0 +; GFX7-NEXT: v_readlane_b32 s30, v23, 15 +; GFX7-NEXT: v_readlane_b32 s31, v23, 16 +; GFX7-NEXT: v_readlane_b32 s55, v23, 14 +; GFX7-NEXT: v_readlane_b32 s54, v23, 13 +; GFX7-NEXT: v_readlane_b32 s53, v23, 12 +; GFX7-NEXT: v_readlane_b32 s52, v23, 11 +; GFX7-NEXT: v_readlane_b32 s51, v23, 10 +; GFX7-NEXT: v_readlane_b32 s50, v23, 9 +; GFX7-NEXT: v_readlane_b32 s49, v23, 8 +; GFX7-NEXT: v_readlane_b32 s48, v23, 7 +; GFX7-NEXT: v_readlane_b32 s39, v23, 6 +; GFX7-NEXT: v_readlane_b32 s38, v23, 5 +; GFX7-NEXT: v_readlane_b32 s37, v23, 4 +; GFX7-NEXT: v_readlane_b32 s36, v23, 3 +; GFX7-NEXT: v_readlane_b32 s35, v23, 2 +; GFX7-NEXT: v_readlane_b32 s34, v23, 1 +; GFX7-NEXT: v_readlane_b32 s33, v23, 0 ; GFX7-NEXT: v_readlane_b32 s28, v23, 17 ; GFX7-NEXT: v_readlane_b32 s29, v23, 18 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 @@ -1206,30 +1207,30 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX8-NEXT: s_add_i32 s6, s32, 0x201100 ; GFX8-NEXT: buffer_store_dword v22, off, s[0:3], s6 ; 4-byte Folded Spill ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_writelane_b32 v23, s30, 0 -; GFX8-NEXT: v_writelane_b32 v23, s31, 1 -; GFX8-NEXT: v_writelane_b32 v23, s33, 2 -; GFX8-NEXT: v_writelane_b32 v23, s34, 3 -; GFX8-NEXT: v_writelane_b32 v23, s35, 4 -; GFX8-NEXT: v_writelane_b32 v23, s36, 5 -; GFX8-NEXT: v_writelane_b32 v23, s37, 6 -; GFX8-NEXT: v_writelane_b32 v23, s38, 7 -; GFX8-NEXT: v_writelane_b32 v23, s39, 8 -; GFX8-NEXT: v_writelane_b32 v23, s48, 9 -; GFX8-NEXT: v_writelane_b32 v23, s49, 10 -; GFX8-NEXT: v_writelane_b32 v23, s50, 11 -; GFX8-NEXT: v_writelane_b32 v23, s51, 12 -; GFX8-NEXT: v_writelane_b32 v23, s52, 13 +; GFX8-NEXT: v_writelane_b32 v23, s33, 0 +; GFX8-NEXT: v_writelane_b32 v23, s34, 1 +; GFX8-NEXT: v_writelane_b32 v23, s35, 2 +; GFX8-NEXT: v_writelane_b32 v23, s36, 3 +; GFX8-NEXT: v_writelane_b32 v23, s37, 4 +; GFX8-NEXT: v_writelane_b32 v23, s38, 5 +; GFX8-NEXT: v_writelane_b32 v23, s39, 6 +; GFX8-NEXT: v_writelane_b32 v23, s48, 7 +; GFX8-NEXT: v_writelane_b32 v23, s49, 8 +; GFX8-NEXT: v_writelane_b32 v23, s50, 9 +; GFX8-NEXT: v_writelane_b32 v23, s51, 10 +; GFX8-NEXT: v_writelane_b32 v23, s52, 11 +; GFX8-NEXT: v_writelane_b32 v23, s53, 12 +; GFX8-NEXT: v_writelane_b32 v23, s54, 13 ; GFX8-NEXT: s_lshr_b32 s5, s32, 6 -; GFX8-NEXT: v_writelane_b32 v23, s53, 14 +; GFX8-NEXT: v_writelane_b32 v23, s55, 14 ; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32 ; GFX8-NEXT: s_add_i32 s4, s5, 0x4240 ; GFX8-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane -; GFX8-NEXT: v_writelane_b32 v23, s54, 15 +; GFX8-NEXT: v_writelane_b32 v23, s30, 15 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 64, v0 ; GFX8-NEXT: v_writelane_b32 v22, s4, 0 ; GFX8-NEXT: s_and_b64 s[4:5], 0, exec -; GFX8-NEXT: v_writelane_b32 v23, s55, 16 +; GFX8-NEXT: v_writelane_b32 v23, s31, 16 ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use alloca0 v0 ; GFX8-NEXT: ;;#ASMEND @@ -1241,23 +1242,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc ; GFX8-NEXT: ;;#ASMEND -; GFX8-NEXT: v_readlane_b32 s55, v23, 16 -; GFX8-NEXT: v_readlane_b32 s54, v23, 15 -; GFX8-NEXT: v_readlane_b32 s53, v23, 14 -; GFX8-NEXT: v_readlane_b32 s52, v23, 13 -; GFX8-NEXT: v_readlane_b32 s51, v23, 12 -; GFX8-NEXT: v_readlane_b32 s50, v23, 11 -; GFX8-NEXT: v_readlane_b32 s49, v23, 10 -; GFX8-NEXT: v_readlane_b32 s48, v23, 9 -; GFX8-NEXT: v_readlane_b32 s39, v23, 8 -; GFX8-NEXT: v_readlane_b32 s38, v23, 7 -; GFX8-NEXT: v_readlane_b32 s37, v23, 6 -; GFX8-NEXT: v_readlane_b32 s36, v23, 5 -; GFX8-NEXT: v_readlane_b32 s35, v23, 4 -; GFX8-NEXT: v_readlane_b32 s34, v23, 3 -; GFX8-NEXT: v_readlane_b32 s33, v23, 2 -; GFX8-NEXT: v_readlane_b32 s31, v23, 1 -; GFX8-NEXT: v_readlane_b32 s30, v23, 0 +; GFX8-NEXT: v_readlane_b32 s30, v23, 15 +; GFX8-NEXT: v_readlane_b32 s31, v23, 16 +; GFX8-NEXT: v_readlane_b32 s55, v23, 14 +; GFX8-NEXT: v_readlane_b32 s54, v23, 13 +; GFX8-NEXT: v_readlane_b32 s53, v23, 12 +; GFX8-NEXT: v_readlane_b32 s52, v23, 11 +; GFX8-NEXT: v_readlane_b32 s51, v23, 10 +; GFX8-NEXT: v_readlane_b32 s50, v23, 9 +; GFX8-NEXT: v_readlane_b32 s49, v23, 8 +; GFX8-NEXT: v_readlane_b32 s48, v23, 7 +; GFX8-NEXT: v_readlane_b32 s39, v23, 6 +; GFX8-NEXT: v_readlane_b32 s38, v23, 5 +; GFX8-NEXT: v_readlane_b32 s37, v23, 4 +; GFX8-NEXT: v_readlane_b32 s36, v23, 3 +; GFX8-NEXT: v_readlane_b32 s35, v23, 2 +; GFX8-NEXT: v_readlane_b32 s34, v23, 1 +; GFX8-NEXT: v_readlane_b32 s33, v23, 0 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: s_add_i32 s6, s32, 0x201000 ; GFX8-NEXT: buffer_load_dword v23, off, s[0:3], s6 ; 4-byte Folded Reload @@ -1276,30 +1277,30 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX900-NEXT: s_add_i32 s6, s32, 0x201100 ; GFX900-NEXT: buffer_store_dword v22, off, s[0:3], s6 ; 4-byte Folded Spill ; GFX900-NEXT: s_mov_b64 exec, s[4:5] -; GFX900-NEXT: v_writelane_b32 v23, s30, 0 -; GFX900-NEXT: v_writelane_b32 v23, s31, 1 -; GFX900-NEXT: v_writelane_b32 v23, s33, 2 -; GFX900-NEXT: v_writelane_b32 v23, s34, 3 -; GFX900-NEXT: v_writelane_b32 v23, s35, 4 -; GFX900-NEXT: v_writelane_b32 v23, s36, 5 -; GFX900-NEXT: v_writelane_b32 v23, s37, 6 -; GFX900-NEXT: v_writelane_b32 v23, s38, 7 -; GFX900-NEXT: v_writelane_b32 v23, s39, 8 -; GFX900-NEXT: v_writelane_b32 v23, s48, 9 -; GFX900-NEXT: v_writelane_b32 v23, s49, 10 -; GFX900-NEXT: v_writelane_b32 v23, s50, 11 -; GFX900-NEXT: v_writelane_b32 v23, s51, 12 -; GFX900-NEXT: v_writelane_b32 v23, s52, 13 +; GFX900-NEXT: v_writelane_b32 v23, s33, 0 +; GFX900-NEXT: v_writelane_b32 v23, s34, 1 +; GFX900-NEXT: v_writelane_b32 v23, s35, 2 +; GFX900-NEXT: v_writelane_b32 v23, s36, 3 +; GFX900-NEXT: v_writelane_b32 v23, s37, 4 +; GFX900-NEXT: v_writelane_b32 v23, s38, 5 +; GFX900-NEXT: v_writelane_b32 v23, s39, 6 +; GFX900-NEXT: v_writelane_b32 v23, s48, 7 +; GFX900-NEXT: v_writelane_b32 v23, s49, 8 +; GFX900-NEXT: v_writelane_b32 v23, s50, 9 +; GFX900-NEXT: v_writelane_b32 v23, s51, 10 +; GFX900-NEXT: v_writelane_b32 v23, s52, 11 +; GFX900-NEXT: v_writelane_b32 v23, s53, 12 +; GFX900-NEXT: v_writelane_b32 v23, s54, 13 ; GFX900-NEXT: s_lshr_b32 s5, s32, 6 -; GFX900-NEXT: v_writelane_b32 v23, s53, 14 +; GFX900-NEXT: v_writelane_b32 v23, s55, 14 ; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32 ; GFX900-NEXT: s_add_i32 s4, s5, 0x4240 ; GFX900-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane -; GFX900-NEXT: v_writelane_b32 v23, s54, 15 +; GFX900-NEXT: v_writelane_b32 v23, s30, 15 ; GFX900-NEXT: v_add_u32_e32 v0, 64, v0 ; GFX900-NEXT: v_writelane_b32 v22, s4, 0 ; GFX900-NEXT: s_and_b64 s[4:5], 0, exec -; GFX900-NEXT: v_writelane_b32 v23, s55, 16 +; GFX900-NEXT: v_writelane_b32 v23, s31, 16 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use alloca0 v0 ; GFX900-NEXT: ;;#ASMEND @@ -1311,23 +1312,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s55, v23, 16 -; GFX900-NEXT: v_readlane_b32 s54, v23, 15 -; GFX900-NEXT: v_readlane_b32 s53, v23, 14 -; GFX900-NEXT: v_readlane_b32 s52, v23, 13 -; GFX900-NEXT: v_readlane_b32 s51, v23, 12 -; GFX900-NEXT: v_readlane_b32 s50, v23, 11 -; GFX900-NEXT: v_readlane_b32 s49, v23, 10 -; GFX900-NEXT: v_readlane_b32 s48, v23, 9 -; GFX900-NEXT: v_readlane_b32 s39, v23, 8 -; GFX900-NEXT: v_readlane_b32 s38, v23, 7 -; GFX900-NEXT: v_readlane_b32 s37, v23, 6 -; GFX900-NEXT: v_readlane_b32 s36, v23, 5 -; GFX900-NEXT: v_readlane_b32 s35, v23, 4 -; GFX900-NEXT: v_readlane_b32 s34, v23, 3 -; GFX900-NEXT: v_readlane_b32 s33, v23, 2 -; GFX900-NEXT: v_readlane_b32 s31, v23, 1 -; GFX900-NEXT: v_readlane_b32 s30, v23, 0 +; GFX900-NEXT: v_readlane_b32 s30, v23, 15 +; GFX900-NEXT: v_readlane_b32 s31, v23, 16 +; GFX900-NEXT: v_readlane_b32 s55, v23, 14 +; GFX900-NEXT: v_readlane_b32 s54, v23, 13 +; GFX900-NEXT: v_readlane_b32 s53, v23, 12 +; GFX900-NEXT: v_readlane_b32 s52, v23, 11 +; GFX900-NEXT: v_readlane_b32 s51, v23, 10 +; GFX900-NEXT: v_readlane_b32 s50, v23, 9 +; GFX900-NEXT: v_readlane_b32 s49, v23, 8 +; GFX900-NEXT: v_readlane_b32 s48, v23, 7 +; GFX900-NEXT: v_readlane_b32 s39, v23, 6 +; GFX900-NEXT: v_readlane_b32 s38, v23, 5 +; GFX900-NEXT: v_readlane_b32 s37, v23, 4 +; GFX900-NEXT: v_readlane_b32 s36, v23, 3 +; GFX900-NEXT: v_readlane_b32 s35, v23, 2 +; GFX900-NEXT: v_readlane_b32 s34, v23, 1 +; GFX900-NEXT: v_readlane_b32 s33, v23, 0 ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: s_add_i32 s6, s32, 0x201000 ; GFX900-NEXT: buffer_load_dword v23, off, s[0:3], s6 ; 4-byte Folded Reload @@ -1344,28 +1345,29 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX942-NEXT: s_add_i32 s2, s32, 0x8040 ; GFX942-NEXT: scratch_store_dword off, v22, s2 ; 4-byte Folded Spill ; GFX942-NEXT: s_mov_b64 exec, s[0:1] -; GFX942-NEXT: v_writelane_b32 v22, s30, 0 -; GFX942-NEXT: v_writelane_b32 v22, s31, 1 -; GFX942-NEXT: v_writelane_b32 v22, s33, 2 -; GFX942-NEXT: v_writelane_b32 v22, s34, 3 -; GFX942-NEXT: v_writelane_b32 v22, s35, 4 -; GFX942-NEXT: v_writelane_b32 v22, s36, 5 -; GFX942-NEXT: v_writelane_b32 v22, s37, 6 -; GFX942-NEXT: v_writelane_b32 v22, s38, 7 -; GFX942-NEXT: v_writelane_b32 v22, s39, 8 -; GFX942-NEXT: v_writelane_b32 v22, s48, 9 -; GFX942-NEXT: v_writelane_b32 v22, s49, 10 -; GFX942-NEXT: v_writelane_b32 v22, s50, 11 -; GFX942-NEXT: v_writelane_b32 v22, s51, 12 -; GFX942-NEXT: v_writelane_b32 v22, s52, 13 -; GFX942-NEXT: v_writelane_b32 v22, s53, 14 +; GFX942-NEXT: v_writelane_b32 v22, s33, 0 +; GFX942-NEXT: v_writelane_b32 v22, s34, 1 +; GFX942-NEXT: v_writelane_b32 v22, s35, 2 +; GFX942-NEXT: v_writelane_b32 v22, s36, 3 +; GFX942-NEXT: v_writelane_b32 v22, s37, 4 +; GFX942-NEXT: v_writelane_b32 v22, s38, 5 +; GFX942-NEXT: v_writelane_b32 v22, s39, 6 +; GFX942-NEXT: v_writelane_b32 v22, s48, 7 +; GFX942-NEXT: v_writelane_b32 v22, s49, 8 +; GFX942-NEXT: v_writelane_b32 v22, s50, 9 +; GFX942-NEXT: v_writelane_b32 v22, s51, 10 +; GFX942-NEXT: v_writelane_b32 v22, s52, 11 +; GFX942-NEXT: v_writelane_b32 v22, s53, 12 +; GFX942-NEXT: v_writelane_b32 v22, s54, 13 +; GFX942-NEXT: v_writelane_b32 v22, s55, 14 ; GFX942-NEXT: s_add_i32 s0, s32, 64 -; GFX942-NEXT: v_writelane_b32 v22, s54, 15 +; GFX942-NEXT: v_writelane_b32 v22, s30, 15 ; GFX942-NEXT: v_mov_b32_e32 v0, s0 -; GFX942-NEXT: v_writelane_b32 v22, s55, 16 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use alloca0 v0 ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_writelane_b32 v22, s31, 16 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc ; GFX942-NEXT: ;;#ASMEND @@ -1376,23 +1378,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: v_readlane_b32 s55, v22, 16 -; GFX942-NEXT: v_readlane_b32 s54, v22, 15 -; GFX942-NEXT: v_readlane_b32 s53, v22, 14 -; GFX942-NEXT: v_readlane_b32 s52, v22, 13 -; GFX942-NEXT: v_readlane_b32 s51, v22, 12 -; GFX942-NEXT: v_readlane_b32 s50, v22, 11 -; GFX942-NEXT: v_readlane_b32 s49, v22, 10 -; GFX942-NEXT: v_readlane_b32 s48, v22, 9 -; GFX942-NEXT: v_readlane_b32 s39, v22, 8 -; GFX942-NEXT: v_readlane_b32 s38, v22, 7 -; GFX942-NEXT: v_readlane_b32 s37, v22, 6 -; GFX942-NEXT: v_readlane_b32 s36, v22, 5 -; GFX942-NEXT: v_readlane_b32 s35, v22, 4 -; GFX942-NEXT: v_readlane_b32 s34, v22, 3 -; GFX942-NEXT: v_readlane_b32 s33, v22, 2 -; GFX942-NEXT: v_readlane_b32 s31, v22, 1 -; GFX942-NEXT: v_readlane_b32 s30, v22, 0 +; GFX942-NEXT: v_readlane_b32 s30, v22, 15 +; GFX942-NEXT: v_readlane_b32 s31, v22, 16 +; GFX942-NEXT: v_readlane_b32 s55, v22, 14 +; GFX942-NEXT: v_readlane_b32 s54, v22, 13 +; GFX942-NEXT: v_readlane_b32 s53, v22, 12 +; GFX942-NEXT: v_readlane_b32 s52, v22, 11 +; GFX942-NEXT: v_readlane_b32 s51, v22, 10 +; GFX942-NEXT: v_readlane_b32 s50, v22, 9 +; GFX942-NEXT: v_readlane_b32 s49, v22, 8 +; GFX942-NEXT: v_readlane_b32 s48, v22, 7 +; GFX942-NEXT: v_readlane_b32 s39, v22, 6 +; GFX942-NEXT: v_readlane_b32 s38, v22, 5 +; GFX942-NEXT: v_readlane_b32 s37, v22, 4 +; GFX942-NEXT: v_readlane_b32 s36, v22, 3 +; GFX942-NEXT: v_readlane_b32 s35, v22, 2 +; GFX942-NEXT: v_readlane_b32 s34, v22, 1 +; GFX942-NEXT: v_readlane_b32 s33, v22, 0 ; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX942-NEXT: s_add_i32 s2, s32, 0x8040 ; GFX942-NEXT: scratch_load_dword v22, off, s2 ; 4-byte Folded Reload @@ -1408,31 +1410,31 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX10_1-NEXT: buffer_store_dword v22, off, s[0:3], s5 ; 4-byte Folded Spill ; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_1-NEXT: v_writelane_b32 v22, s30, 0 +; GFX10_1-NEXT: v_writelane_b32 v22, s33, 0 ; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32 ; GFX10_1-NEXT: s_lshr_b32 s4, s32, 5 ; GFX10_1-NEXT: s_add_i32 s58, s4, 0x4240 -; GFX10_1-NEXT: v_writelane_b32 v22, s31, 1 +; GFX10_1-NEXT: v_writelane_b32 v22, s34, 1 ; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0 ; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; use alloca0 v0 ; GFX10_1-NEXT: ;;#ASMEND -; GFX10_1-NEXT: v_writelane_b32 v22, s33, 2 -; GFX10_1-NEXT: v_writelane_b32 v22, s34, 3 -; GFX10_1-NEXT: v_writelane_b32 v22, s35, 4 -; GFX10_1-NEXT: v_writelane_b32 v22, s36, 5 -; GFX10_1-NEXT: v_writelane_b32 v22, s37, 6 -; GFX10_1-NEXT: v_writelane_b32 v22, s38, 7 -; GFX10_1-NEXT: v_writelane_b32 v22, s39, 8 -; GFX10_1-NEXT: v_writelane_b32 v22, s48, 9 -; GFX10_1-NEXT: v_writelane_b32 v22, s49, 10 -; GFX10_1-NEXT: v_writelane_b32 v22, s50, 11 -; GFX10_1-NEXT: v_writelane_b32 v22, s51, 12 -; GFX10_1-NEXT: v_writelane_b32 v22, s52, 13 -; GFX10_1-NEXT: v_writelane_b32 v22, s53, 14 -; GFX10_1-NEXT: v_writelane_b32 v22, s54, 15 -; GFX10_1-NEXT: v_writelane_b32 v22, s55, 16 +; GFX10_1-NEXT: v_writelane_b32 v22, s35, 2 +; GFX10_1-NEXT: v_writelane_b32 v22, s36, 3 +; GFX10_1-NEXT: v_writelane_b32 v22, s37, 4 +; GFX10_1-NEXT: v_writelane_b32 v22, s38, 5 +; GFX10_1-NEXT: v_writelane_b32 v22, s39, 6 +; GFX10_1-NEXT: v_writelane_b32 v22, s48, 7 +; GFX10_1-NEXT: v_writelane_b32 v22, s49, 8 +; GFX10_1-NEXT: v_writelane_b32 v22, s50, 9 +; GFX10_1-NEXT: v_writelane_b32 v22, s51, 10 +; GFX10_1-NEXT: v_writelane_b32 v22, s52, 11 +; GFX10_1-NEXT: v_writelane_b32 v22, s53, 12 +; GFX10_1-NEXT: v_writelane_b32 v22, s54, 13 +; GFX10_1-NEXT: v_writelane_b32 v22, s55, 14 +; GFX10_1-NEXT: v_writelane_b32 v22, s30, 15 +; GFX10_1-NEXT: v_writelane_b32 v22, s31, 16 ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc ; GFX10_1-NEXT: ;;#ASMEND @@ -1441,23 +1443,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc ; GFX10_1-NEXT: ;;#ASMEND -; GFX10_1-NEXT: v_readlane_b32 s55, v22, 16 -; GFX10_1-NEXT: v_readlane_b32 s54, v22, 15 -; GFX10_1-NEXT: v_readlane_b32 s53, v22, 14 -; GFX10_1-NEXT: v_readlane_b32 s52, v22, 13 -; GFX10_1-NEXT: v_readlane_b32 s51, v22, 12 -; GFX10_1-NEXT: v_readlane_b32 s50, v22, 11 -; GFX10_1-NEXT: v_readlane_b32 s49, v22, 10 -; GFX10_1-NEXT: v_readlane_b32 s48, v22, 9 -; GFX10_1-NEXT: v_readlane_b32 s39, v22, 8 -; GFX10_1-NEXT: v_readlane_b32 s38, v22, 7 -; GFX10_1-NEXT: v_readlane_b32 s37, v22, 6 -; GFX10_1-NEXT: v_readlane_b32 s36, v22, 5 -; GFX10_1-NEXT: v_readlane_b32 s35, v22, 4 -; GFX10_1-NEXT: v_readlane_b32 s34, v22, 3 -; GFX10_1-NEXT: v_readlane_b32 s33, v22, 2 -; GFX10_1-NEXT: v_readlane_b32 s31, v22, 1 -; GFX10_1-NEXT: v_readlane_b32 s30, v22, 0 +; GFX10_1-NEXT: v_readlane_b32 s30, v22, 15 +; GFX10_1-NEXT: v_readlane_b32 s31, v22, 16 +; GFX10_1-NEXT: v_readlane_b32 s55, v22, 14 +; GFX10_1-NEXT: v_readlane_b32 s54, v22, 13 +; GFX10_1-NEXT: v_readlane_b32 s53, v22, 12 +; GFX10_1-NEXT: v_readlane_b32 s52, v22, 11 +; GFX10_1-NEXT: v_readlane_b32 s51, v22, 10 +; GFX10_1-NEXT: v_readlane_b32 s50, v22, 9 +; GFX10_1-NEXT: v_readlane_b32 s49, v22, 8 +; GFX10_1-NEXT: v_readlane_b32 s48, v22, 7 +; GFX10_1-NEXT: v_readlane_b32 s39, v22, 6 +; GFX10_1-NEXT: v_readlane_b32 s38, v22, 5 +; GFX10_1-NEXT: v_readlane_b32 s37, v22, 4 +; GFX10_1-NEXT: v_readlane_b32 s36, v22, 3 +; GFX10_1-NEXT: v_readlane_b32 s35, v22, 2 +; GFX10_1-NEXT: v_readlane_b32 s34, v22, 1 +; GFX10_1-NEXT: v_readlane_b32 s33, v22, 0 ; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10_1-NEXT: s_add_i32 s5, s32, 0x100800 ; GFX10_1-NEXT: buffer_load_dword v22, off, s[0:3], s5 ; 4-byte Folded Reload @@ -1473,31 +1475,31 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX10_3-NEXT: s_add_i32 s5, s32, 0x100800 ; GFX10_3-NEXT: buffer_store_dword v22, off, s[0:3], s5 ; 4-byte Folded Spill ; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_3-NEXT: v_writelane_b32 v22, s30, 0 +; GFX10_3-NEXT: v_writelane_b32 v22, s33, 0 ; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32 ; GFX10_3-NEXT: s_lshr_b32 s4, s32, 5 ; GFX10_3-NEXT: s_add_i32 s58, s4, 0x4240 -; GFX10_3-NEXT: v_writelane_b32 v22, s31, 1 +; GFX10_3-NEXT: v_writelane_b32 v22, s34, 1 ; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0 ; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; use alloca0 v0 ; GFX10_3-NEXT: ;;#ASMEND -; GFX10_3-NEXT: v_writelane_b32 v22, s33, 2 -; GFX10_3-NEXT: v_writelane_b32 v22, s34, 3 -; GFX10_3-NEXT: v_writelane_b32 v22, s35, 4 -; GFX10_3-NEXT: v_writelane_b32 v22, s36, 5 -; GFX10_3-NEXT: v_writelane_b32 v22, s37, 6 -; GFX10_3-NEXT: v_writelane_b32 v22, s38, 7 -; GFX10_3-NEXT: v_writelane_b32 v22, s39, 8 -; GFX10_3-NEXT: v_writelane_b32 v22, s48, 9 -; GFX10_3-NEXT: v_writelane_b32 v22, s49, 10 -; GFX10_3-NEXT: v_writelane_b32 v22, s50, 11 -; GFX10_3-NEXT: v_writelane_b32 v22, s51, 12 -; GFX10_3-NEXT: v_writelane_b32 v22, s52, 13 -; GFX10_3-NEXT: v_writelane_b32 v22, s53, 14 -; GFX10_3-NEXT: v_writelane_b32 v22, s54, 15 -; GFX10_3-NEXT: v_writelane_b32 v22, s55, 16 +; GFX10_3-NEXT: v_writelane_b32 v22, s35, 2 +; GFX10_3-NEXT: v_writelane_b32 v22, s36, 3 +; GFX10_3-NEXT: v_writelane_b32 v22, s37, 4 +; GFX10_3-NEXT: v_writelane_b32 v22, s38, 5 +; GFX10_3-NEXT: v_writelane_b32 v22, s39, 6 +; GFX10_3-NEXT: v_writelane_b32 v22, s48, 7 +; GFX10_3-NEXT: v_writelane_b32 v22, s49, 8 +; GFX10_3-NEXT: v_writelane_b32 v22, s50, 9 +; GFX10_3-NEXT: v_writelane_b32 v22, s51, 10 +; GFX10_3-NEXT: v_writelane_b32 v22, s52, 11 +; GFX10_3-NEXT: v_writelane_b32 v22, s53, 12 +; GFX10_3-NEXT: v_writelane_b32 v22, s54, 13 +; GFX10_3-NEXT: v_writelane_b32 v22, s55, 14 +; GFX10_3-NEXT: v_writelane_b32 v22, s30, 15 +; GFX10_3-NEXT: v_writelane_b32 v22, s31, 16 ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc ; GFX10_3-NEXT: ;;#ASMEND @@ -1506,23 +1508,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc ; GFX10_3-NEXT: ;;#ASMEND -; GFX10_3-NEXT: v_readlane_b32 s55, v22, 16 -; GFX10_3-NEXT: v_readlane_b32 s54, v22, 15 -; GFX10_3-NEXT: v_readlane_b32 s53, v22, 14 -; GFX10_3-NEXT: v_readlane_b32 s52, v22, 13 -; GFX10_3-NEXT: v_readlane_b32 s51, v22, 12 -; GFX10_3-NEXT: v_readlane_b32 s50, v22, 11 -; GFX10_3-NEXT: v_readlane_b32 s49, v22, 10 -; GFX10_3-NEXT: v_readlane_b32 s48, v22, 9 -; GFX10_3-NEXT: v_readlane_b32 s39, v22, 8 -; GFX10_3-NEXT: v_readlane_b32 s38, v22, 7 -; GFX10_3-NEXT: v_readlane_b32 s37, v22, 6 -; GFX10_3-NEXT: v_readlane_b32 s36, v22, 5 -; GFX10_3-NEXT: v_readlane_b32 s35, v22, 4 -; GFX10_3-NEXT: v_readlane_b32 s34, v22, 3 -; GFX10_3-NEXT: v_readlane_b32 s33, v22, 2 -; GFX10_3-NEXT: v_readlane_b32 s31, v22, 1 -; GFX10_3-NEXT: v_readlane_b32 s30, v22, 0 +; GFX10_3-NEXT: v_readlane_b32 s30, v22, 15 +; GFX10_3-NEXT: v_readlane_b32 s31, v22, 16 +; GFX10_3-NEXT: v_readlane_b32 s55, v22, 14 +; GFX10_3-NEXT: v_readlane_b32 s54, v22, 13 +; GFX10_3-NEXT: v_readlane_b32 s53, v22, 12 +; GFX10_3-NEXT: v_readlane_b32 s52, v22, 11 +; GFX10_3-NEXT: v_readlane_b32 s51, v22, 10 +; GFX10_3-NEXT: v_readlane_b32 s50, v22, 9 +; GFX10_3-NEXT: v_readlane_b32 s49, v22, 8 +; GFX10_3-NEXT: v_readlane_b32 s48, v22, 7 +; GFX10_3-NEXT: v_readlane_b32 s39, v22, 6 +; GFX10_3-NEXT: v_readlane_b32 s38, v22, 5 +; GFX10_3-NEXT: v_readlane_b32 s37, v22, 4 +; GFX10_3-NEXT: v_readlane_b32 s36, v22, 3 +; GFX10_3-NEXT: v_readlane_b32 s35, v22, 2 +; GFX10_3-NEXT: v_readlane_b32 s34, v22, 1 +; GFX10_3-NEXT: v_readlane_b32 s33, v22, 0 ; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10_3-NEXT: s_add_i32 s5, s32, 0x100800 ; GFX10_3-NEXT: buffer_load_dword v22, off, s[0:3], s5 ; 4-byte Folded Reload @@ -1537,30 +1539,30 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX11-NEXT: s_add_i32 s1, s32, 0x8040 ; GFX11-NEXT: scratch_store_b32 off, v22, s1 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v22, s30, 0 +; GFX11-NEXT: v_writelane_b32 v22, s33, 0 ; GFX11-NEXT: s_add_i32 s0, s32, 64 ; GFX11-NEXT: s_add_i32 s58, s32, 0x4240 ; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: s_and_b32 s0, 0, exec_lo -; GFX11-NEXT: v_writelane_b32 v22, s31, 1 +; GFX11-NEXT: v_writelane_b32 v22, s34, 1 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use alloca0 v0 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_writelane_b32 v22, s33, 2 -; GFX11-NEXT: v_writelane_b32 v22, s34, 3 -; GFX11-NEXT: v_writelane_b32 v22, s35, 4 -; GFX11-NEXT: v_writelane_b32 v22, s36, 5 -; GFX11-NEXT: v_writelane_b32 v22, s37, 6 -; GFX11-NEXT: v_writelane_b32 v22, s38, 7 -; GFX11-NEXT: v_writelane_b32 v22, s39, 8 -; GFX11-NEXT: v_writelane_b32 v22, s48, 9 -; GFX11-NEXT: v_writelane_b32 v22, s49, 10 -; GFX11-NEXT: v_writelane_b32 v22, s50, 11 -; GFX11-NEXT: v_writelane_b32 v22, s51, 12 -; GFX11-NEXT: v_writelane_b32 v22, s52, 13 -; GFX11-NEXT: v_writelane_b32 v22, s53, 14 -; GFX11-NEXT: v_writelane_b32 v22, s54, 15 -; GFX11-NEXT: v_writelane_b32 v22, s55, 16 +; GFX11-NEXT: v_writelane_b32 v22, s35, 2 +; GFX11-NEXT: v_writelane_b32 v22, s36, 3 +; GFX11-NEXT: v_writelane_b32 v22, s37, 4 +; GFX11-NEXT: v_writelane_b32 v22, s38, 5 +; GFX11-NEXT: v_writelane_b32 v22, s39, 6 +; GFX11-NEXT: v_writelane_b32 v22, s48, 7 +; GFX11-NEXT: v_writelane_b32 v22, s49, 8 +; GFX11-NEXT: v_writelane_b32 v22, s50, 9 +; GFX11-NEXT: v_writelane_b32 v22, s51, 10 +; GFX11-NEXT: v_writelane_b32 v22, s52, 11 +; GFX11-NEXT: v_writelane_b32 v22, s53, 12 +; GFX11-NEXT: v_writelane_b32 v22, s54, 13 +; GFX11-NEXT: v_writelane_b32 v22, s55, 14 +; GFX11-NEXT: v_writelane_b32 v22, s30, 15 +; GFX11-NEXT: v_writelane_b32 v22, s31, 16 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc ; GFX11-NEXT: ;;#ASMEND @@ -1570,23 +1572,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX11-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readlane_b32 s55, v22, 16 -; GFX11-NEXT: v_readlane_b32 s54, v22, 15 -; GFX11-NEXT: v_readlane_b32 s53, v22, 14 -; GFX11-NEXT: v_readlane_b32 s52, v22, 13 -; GFX11-NEXT: v_readlane_b32 s51, v22, 12 -; GFX11-NEXT: v_readlane_b32 s50, v22, 11 -; GFX11-NEXT: v_readlane_b32 s49, v22, 10 -; GFX11-NEXT: v_readlane_b32 s48, v22, 9 -; GFX11-NEXT: v_readlane_b32 s39, v22, 8 -; GFX11-NEXT: v_readlane_b32 s38, v22, 7 -; GFX11-NEXT: v_readlane_b32 s37, v22, 6 -; GFX11-NEXT: v_readlane_b32 s36, v22, 5 -; GFX11-NEXT: v_readlane_b32 s35, v22, 4 -; GFX11-NEXT: v_readlane_b32 s34, v22, 3 -; GFX11-NEXT: v_readlane_b32 s33, v22, 2 -; GFX11-NEXT: v_readlane_b32 s31, v22, 1 -; GFX11-NEXT: v_readlane_b32 s30, v22, 0 +; GFX11-NEXT: v_readlane_b32 s30, v22, 15 +; GFX11-NEXT: v_readlane_b32 s31, v22, 16 +; GFX11-NEXT: v_readlane_b32 s55, v22, 14 +; GFX11-NEXT: v_readlane_b32 s54, v22, 13 +; GFX11-NEXT: v_readlane_b32 s53, v22, 12 +; GFX11-NEXT: v_readlane_b32 s52, v22, 11 +; GFX11-NEXT: v_readlane_b32 s51, v22, 10 +; GFX11-NEXT: v_readlane_b32 s50, v22, 9 +; GFX11-NEXT: v_readlane_b32 s49, v22, 8 +; GFX11-NEXT: v_readlane_b32 s48, v22, 7 +; GFX11-NEXT: v_readlane_b32 s39, v22, 6 +; GFX11-NEXT: v_readlane_b32 s38, v22, 5 +; GFX11-NEXT: v_readlane_b32 s37, v22, 4 +; GFX11-NEXT: v_readlane_b32 s36, v22, 3 +; GFX11-NEXT: v_readlane_b32 s35, v22, 2 +; GFX11-NEXT: v_readlane_b32 s34, v22, 1 +; GFX11-NEXT: v_readlane_b32 s33, v22, 0 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: s_add_i32 s1, s32, 0x8040 ; GFX11-NEXT: scratch_load_b32 v22, off, s1 ; 4-byte Folded Reload @@ -1605,29 +1607,29 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX12-NEXT: scratch_store_b32 off, v22, s32 offset:32768 ; 4-byte Folded Spill ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 -; GFX12-NEXT: v_writelane_b32 v22, s30, 0 +; GFX12-NEXT: v_writelane_b32 v22, s33, 0 ; GFX12-NEXT: s_add_co_i32 s58, s32, 0x4200 ; GFX12-NEXT: v_mov_b32_e32 v0, s32 ; GFX12-NEXT: s_and_b32 s0, 0, exec_lo ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use alloca0 v0 ; GFX12-NEXT: ;;#ASMEND -; GFX12-NEXT: v_writelane_b32 v22, s31, 1 -; GFX12-NEXT: v_writelane_b32 v22, s33, 2 -; GFX12-NEXT: v_writelane_b32 v22, s34, 3 -; GFX12-NEXT: v_writelane_b32 v22, s35, 4 -; GFX12-NEXT: v_writelane_b32 v22, s36, 5 -; GFX12-NEXT: v_writelane_b32 v22, s37, 6 -; GFX12-NEXT: v_writelane_b32 v22, s38, 7 -; GFX12-NEXT: v_writelane_b32 v22, s39, 8 -; GFX12-NEXT: v_writelane_b32 v22, s48, 9 -; GFX12-NEXT: v_writelane_b32 v22, s49, 10 -; GFX12-NEXT: v_writelane_b32 v22, s50, 11 -; GFX12-NEXT: v_writelane_b32 v22, s51, 12 -; GFX12-NEXT: v_writelane_b32 v22, s52, 13 -; GFX12-NEXT: v_writelane_b32 v22, s53, 14 -; GFX12-NEXT: v_writelane_b32 v22, s54, 15 -; GFX12-NEXT: v_writelane_b32 v22, s55, 16 +; GFX12-NEXT: v_writelane_b32 v22, s34, 1 +; GFX12-NEXT: v_writelane_b32 v22, s35, 2 +; GFX12-NEXT: v_writelane_b32 v22, s36, 3 +; GFX12-NEXT: v_writelane_b32 v22, s37, 4 +; GFX12-NEXT: v_writelane_b32 v22, s38, 5 +; GFX12-NEXT: v_writelane_b32 v22, s39, 6 +; GFX12-NEXT: v_writelane_b32 v22, s48, 7 +; GFX12-NEXT: v_writelane_b32 v22, s49, 8 +; GFX12-NEXT: v_writelane_b32 v22, s50, 9 +; GFX12-NEXT: v_writelane_b32 v22, s51, 10 +; GFX12-NEXT: v_writelane_b32 v22, s52, 11 +; GFX12-NEXT: v_writelane_b32 v22, s53, 12 +; GFX12-NEXT: v_writelane_b32 v22, s54, 13 +; GFX12-NEXT: v_writelane_b32 v22, s55, 14 +; GFX12-NEXT: v_writelane_b32 v22, s30, 15 +; GFX12-NEXT: v_writelane_b32 v22, s31, 16 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc ; GFX12-NEXT: ;;#ASMEND @@ -1637,23 +1639,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s54, scc ; GFX12-NEXT: ;;#ASMEND -; GFX12-NEXT: v_readlane_b32 s55, v22, 16 -; GFX12-NEXT: v_readlane_b32 s54, v22, 15 -; GFX12-NEXT: v_readlane_b32 s53, v22, 14 -; GFX12-NEXT: v_readlane_b32 s52, v22, 13 -; GFX12-NEXT: v_readlane_b32 s51, v22, 12 -; GFX12-NEXT: v_readlane_b32 s50, v22, 11 -; GFX12-NEXT: v_readlane_b32 s49, v22, 10 -; GFX12-NEXT: v_readlane_b32 s48, v22, 9 -; GFX12-NEXT: v_readlane_b32 s39, v22, 8 -; GFX12-NEXT: v_readlane_b32 s38, v22, 7 -; GFX12-NEXT: v_readlane_b32 s37, v22, 6 -; GFX12-NEXT: v_readlane_b32 s36, v22, 5 -; GFX12-NEXT: v_readlane_b32 s35, v22, 4 -; GFX12-NEXT: v_readlane_b32 s34, v22, 3 -; GFX12-NEXT: v_readlane_b32 s33, v22, 2 -; GFX12-NEXT: v_readlane_b32 s31, v22, 1 -; GFX12-NEXT: v_readlane_b32 s30, v22, 0 +; GFX12-NEXT: v_readlane_b32 s30, v22, 15 +; GFX12-NEXT: v_readlane_b32 s31, v22, 16 +; GFX12-NEXT: v_readlane_b32 s55, v22, 14 +; GFX12-NEXT: v_readlane_b32 s54, v22, 13 +; GFX12-NEXT: v_readlane_b32 s53, v22, 12 +; GFX12-NEXT: v_readlane_b32 s52, v22, 11 +; GFX12-NEXT: v_readlane_b32 s51, v22, 10 +; GFX12-NEXT: v_readlane_b32 s50, v22, 9 +; GFX12-NEXT: v_readlane_b32 s49, v22, 8 +; GFX12-NEXT: v_readlane_b32 s48, v22, 7 +; GFX12-NEXT: v_readlane_b32 s39, v22, 6 +; GFX12-NEXT: v_readlane_b32 s38, v22, 5 +; GFX12-NEXT: v_readlane_b32 s37, v22, 4 +; GFX12-NEXT: v_readlane_b32 s36, v22, 3 +; GFX12-NEXT: v_readlane_b32 s35, v22, 2 +; GFX12-NEXT: v_readlane_b32 s34, v22, 1 +; GFX12-NEXT: v_readlane_b32 s33, v22, 0 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_load_b32 v22, off, s32 offset:32768 ; 4-byte Folded Reload ; GFX12-NEXT: s_wait_alu 0xfffe diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll index 33cd598aae9b5..7b87cb014ecbf 100644 --- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -194,22 +194,22 @@ define void @slsr1_1(i32 %b.arg, i32 %s.arg) #0 { ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: v_writelane_b32 v43, s4, 5 -; GFX9-NEXT: v_writelane_b32 v43, s30, 0 -; GFX9-NEXT: v_writelane_b32 v43, s31, 1 ; GFX9-NEXT: s_addk_i32 s32, 0x800 -; GFX9-NEXT: v_writelane_b32 v43, s34, 2 -; GFX9-NEXT: v_writelane_b32 v43, s36, 3 +; GFX9-NEXT: v_writelane_b32 v43, s34, 0 +; GFX9-NEXT: v_writelane_b32 v43, s36, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v43, s37, 4 +; GFX9-NEXT: v_writelane_b32 v43, s37, 2 ; GFX9-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x0 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v40, v1 ; GFX9-NEXT: v_mov_b32_e32 v41, v0 +; GFX9-NEXT: v_writelane_b32 v43, s30, 3 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, v41, v40 +; GFX9-NEXT: v_writelane_b32 v43, s31, 4 ; GFX9-NEXT: s_mov_b32 s34, s15 ; GFX9-NEXT: v_and_b32_e32 v42, 0xffffff, v40 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -224,11 +224,11 @@ define void @slsr1_1(i32 %b.arg, i32 %s.arg) #0 { ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s37, v43, 4 -; GFX9-NEXT: v_readlane_b32 s36, v43, 3 -; GFX9-NEXT: v_readlane_b32 s34, v43, 2 -; GFX9-NEXT: v_readlane_b32 s31, v43, 1 -; GFX9-NEXT: v_readlane_b32 s30, v43, 0 +; GFX9-NEXT: v_readlane_b32 s30, v43, 3 +; GFX9-NEXT: v_readlane_b32 s31, v43, 4 +; GFX9-NEXT: v_readlane_b32 s37, v43, 2 +; GFX9-NEXT: v_readlane_b32 s36, v43, 1 +; GFX9-NEXT: v_readlane_b32 s34, v43, 0 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v43, 5 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 diff --git a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll index 65446a036c91b..878302e4865bb 100644 --- a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll @@ -47,8 +47,8 @@ define internal fastcc void @csr_vgpr_spill_fp_callee() #0 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; clobber csr v40 ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s31, v1, 1 ; CHECK-NEXT: v_readlane_b32 s30, v1, 0 +; CHECK-NEXT: v_readlane_b32 s31, v1, 1 ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 @@ -190,8 +190,8 @@ define hidden i32 @caller_save_vgpr_spill_fp_tail_call() #0 { ; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21] ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: v_readlane_b32 s31, v1, 1 ; CHECK-NEXT: v_readlane_b32 s30, v1, 0 +; CHECK-NEXT: v_readlane_b32 s31, v1, 1 ; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload @@ -224,8 +224,8 @@ define hidden i32 @caller_save_vgpr_spill_fp() #0 { ; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21] ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: v_readlane_b32 s31, v2, 1 ; CHECK-NEXT: v_readlane_b32 s30, v2, 0 +; CHECK-NEXT: v_readlane_b32 s31, v2, 1 ; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/nested-calls.ll b/llvm/test/CodeGen/AMDGPU/nested-calls.ll index ccaf0ac5377e4..8394b325bee6d 100644 --- a/llvm/test/CodeGen/AMDGPU/nested-calls.ll +++ b/llvm/test/CodeGen/AMDGPU/nested-calls.ll @@ -29,8 +29,8 @@ define void @test_func_call_external_void_func_i32_imm() #0 { ; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s4, v40, 2 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -68,8 +68,8 @@ define void @test_func_call_external_void_func_i32_imm_stack_use() #0 { ; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s4, v40, 2 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 diff --git a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll index 6b6c60ebe2a9e..133cc166c3311 100644 --- a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll +++ b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll @@ -247,8 +247,8 @@ define hidden void @_ZL3barv() #0 !dbg !1644 { ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: .Ltmp1: ; CHECK-NEXT: .loc 0 32 1 ; lane-info.cpp:32:1 -; CHECK-NEXT: v_readlane_b32 s31, v40, 1 ; CHECK-NEXT: v_readlane_b32 s30, v40, 0 +; CHECK-NEXT: v_readlane_b32 s31, v40, 1 ; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: v_readlane_b32 s4, v40, 2 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 diff --git a/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll b/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll index dba10f19eb500..ef5681798fb19 100644 --- a/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll +++ b/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll @@ -20,9 +20,9 @@ define void @test_remat_s_getpc_b64() { ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_readlane_b32 s30, v2, 0 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: v_readlane_b32 s31, v2, 1 -; GFX9-NEXT: v_readlane_b32 s30, v2, 0 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] @@ -45,8 +45,8 @@ define void @test_remat_s_getpc_b64() { ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: v_readlane_b32 s31, v2, 1 ; GFX11-NEXT: v_readlane_b32 s30, v2, 0 +; GFX11-NEXT: v_readlane_b32 s31, v2, 1 ; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v2, off, s32 ; 4-byte Folded Reload @@ -79,8 +79,8 @@ define void @test_remat_s_getpc_b64() { ; GFX12-NEXT: s_sext_i32_i16 s1, s1 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_readlane_b32 s31, v2, 1 ; GFX12-NEXT: v_readlane_b32 s30, v2, 0 +; GFX12-NEXT: v_readlane_b32 s31, v2, 1 ; GFX12-NEXT: global_store_b64 v[0:1], v[0:1], off ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_load_b32 v2, off, s32 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir index 925984b15367d..4a2829947f9a0 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir @@ -28,16 +28,16 @@ body: | ; GCN-LABEL: name: test_main ; GCN: bb.0: ; GCN-NEXT: successors: %bb.1(0x80000000) - ; GCN-NEXT: liveins: $vcc_hi, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $sgpr102, $sgpr103, $vgpr0 + ; GCN-NEXT: liveins: $vcc_hi, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $sgpr102, $sgpr103, $vgpr0, $sgpr30_sgpr31 ; GCN-NEXT: {{ $}} ; GCN-NEXT: $vcc_hi = frame-setup COPY $sgpr33 ; GCN-NEXT: $sgpr33 = frame-setup COPY $sgpr32 ; GCN-NEXT: $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.69, addrspace 5) - ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr2, $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.70, addrspace 5) - ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr3, $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.71, addrspace 5) - ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr4, $sgpr33, 12, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.72, addrspace 5) - ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr5, $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.73, addrspace 5) + ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.68, addrspace 5) + ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr2, $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.69, addrspace 5) + ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr3, $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.70, addrspace 5) + ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr4, $sgpr33, 12, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.71, addrspace 5) + ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr5, $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.72, addrspace 5) ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0 ; GCN-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 24, implicit-def dead $scc ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, $vgpr2 @@ -66,48 +66,48 @@ body: | ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr27, 23, $vgpr2 ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr28, 24, $vgpr2 ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr29, 25, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr30, 26, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr31, 27, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr64, 28, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr65, 29, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr66, 30, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr67, 31, $vgpr2 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr68, 0, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr69, 1, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr70, 2, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr71, 3, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr72, 4, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr73, 5, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr74, 6, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr75, 7, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr76, 8, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr77, 9, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr78, 10, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr79, 11, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr80, 12, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr81, 13, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr82, 14, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr83, 15, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr84, 16, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr85, 17, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr86, 18, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr87, 19, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr88, 20, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr89, 21, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr90, 22, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr91, 23, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr92, 24, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr93, 25, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr94, 26, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr95, 27, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr96, 28, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr97, 29, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr98, 30, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr99, 31, $vgpr3 - ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr100, 0, $vgpr4 - ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr101, 1, $vgpr4 - ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr102, 2, $vgpr4 - ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr103, 3, $vgpr4 + ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr64, 26, $vgpr2 + ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr65, 27, $vgpr2 + ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr66, 28, $vgpr2 + ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr67, 29, $vgpr2 + ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr68, 30, $vgpr2 + ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr69, 31, $vgpr2 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr70, 0, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr71, 1, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr72, 2, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr73, 3, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr74, 4, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr75, 5, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr76, 6, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr77, 7, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr78, 8, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr79, 9, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr80, 10, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr81, 11, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr82, 12, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr83, 13, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr84, 14, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr85, 15, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr86, 16, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr87, 17, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr88, 18, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr89, 19, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr90, 20, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr91, 21, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr92, 22, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr93, 23, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr94, 24, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr95, 25, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr96, 26, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr97, 27, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr98, 28, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr99, 29, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr100, 30, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr101, 31, $vgpr3 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr102, 0, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr103, 1, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr30, 2, $vgpr4, implicit-def $sgpr30_sgpr31, implicit $sgpr30_sgpr31 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr31, 3, $vgpr4, implicit $sgpr30_sgpr31 ; GCN-NEXT: $sgpr22 = IMPLICIT_DEF ; GCN-NEXT: $vgpr5 = IMPLICIT_DEF ; GCN-NEXT: $vgpr5 = SI_SPILL_S32_TO_VGPR $sgpr22, 0, killed $vgpr5 @@ -130,48 +130,48 @@ body: | ; GCN-NEXT: bb.3: ; GCN-NEXT: liveins: $vcc_hi ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $sgpr103 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 3 - ; GCN-NEXT: $sgpr102 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 2 - ; GCN-NEXT: $sgpr101 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 1 - ; GCN-NEXT: $sgpr100 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 0 - ; GCN-NEXT: $sgpr99 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 31 - ; GCN-NEXT: $sgpr98 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 30 - ; GCN-NEXT: $sgpr97 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 29 - ; GCN-NEXT: $sgpr96 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 28 - ; GCN-NEXT: $sgpr95 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 27 - ; GCN-NEXT: $sgpr94 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 26 - ; GCN-NEXT: $sgpr93 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 25 - ; GCN-NEXT: $sgpr92 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 24 - ; GCN-NEXT: $sgpr91 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 23 - ; GCN-NEXT: $sgpr90 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 22 - ; GCN-NEXT: $sgpr89 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 21 - ; GCN-NEXT: $sgpr88 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 20 - ; GCN-NEXT: $sgpr87 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 19 - ; GCN-NEXT: $sgpr86 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 18 - ; GCN-NEXT: $sgpr85 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 17 - ; GCN-NEXT: $sgpr84 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 16 - ; GCN-NEXT: $sgpr83 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 15 - ; GCN-NEXT: $sgpr82 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 14 - ; GCN-NEXT: $sgpr81 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 13 - ; GCN-NEXT: $sgpr80 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 12 - ; GCN-NEXT: $sgpr79 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 11 - ; GCN-NEXT: $sgpr78 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 10 - ; GCN-NEXT: $sgpr77 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 9 - ; GCN-NEXT: $sgpr76 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 8 - ; GCN-NEXT: $sgpr75 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 7 - ; GCN-NEXT: $sgpr74 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 6 - ; GCN-NEXT: $sgpr73 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 5 - ; GCN-NEXT: $sgpr72 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 4 - ; GCN-NEXT: $sgpr71 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 3 - ; GCN-NEXT: $sgpr70 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 2 - ; GCN-NEXT: $sgpr69 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 1 - ; GCN-NEXT: $sgpr68 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 0 - ; GCN-NEXT: $sgpr67 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 31 - ; GCN-NEXT: $sgpr66 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 30 - ; GCN-NEXT: $sgpr65 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 29 - ; GCN-NEXT: $sgpr64 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 28 - ; GCN-NEXT: $sgpr31 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 27 - ; GCN-NEXT: $sgpr30 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 26 + ; GCN-NEXT: $sgpr30 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 2, implicit-def $sgpr30_sgpr31 + ; GCN-NEXT: $sgpr31 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 3 + ; GCN-NEXT: $sgpr103 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 1 + ; GCN-NEXT: $sgpr102 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 0 + ; GCN-NEXT: $sgpr101 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 31 + ; GCN-NEXT: $sgpr100 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 30 + ; GCN-NEXT: $sgpr99 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 29 + ; GCN-NEXT: $sgpr98 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 28 + ; GCN-NEXT: $sgpr97 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 27 + ; GCN-NEXT: $sgpr96 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 26 + ; GCN-NEXT: $sgpr95 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 25 + ; GCN-NEXT: $sgpr94 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 24 + ; GCN-NEXT: $sgpr93 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 23 + ; GCN-NEXT: $sgpr92 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 22 + ; GCN-NEXT: $sgpr91 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 21 + ; GCN-NEXT: $sgpr90 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 20 + ; GCN-NEXT: $sgpr89 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 19 + ; GCN-NEXT: $sgpr88 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 18 + ; GCN-NEXT: $sgpr87 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 17 + ; GCN-NEXT: $sgpr86 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 16 + ; GCN-NEXT: $sgpr85 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 15 + ; GCN-NEXT: $sgpr84 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 14 + ; GCN-NEXT: $sgpr83 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 13 + ; GCN-NEXT: $sgpr82 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 12 + ; GCN-NEXT: $sgpr81 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 11 + ; GCN-NEXT: $sgpr80 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 10 + ; GCN-NEXT: $sgpr79 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 9 + ; GCN-NEXT: $sgpr78 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 8 + ; GCN-NEXT: $sgpr77 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 7 + ; GCN-NEXT: $sgpr76 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 6 + ; GCN-NEXT: $sgpr75 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 5 + ; GCN-NEXT: $sgpr74 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 4 + ; GCN-NEXT: $sgpr73 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 3 + ; GCN-NEXT: $sgpr72 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 2 + ; GCN-NEXT: $sgpr71 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 1 + ; GCN-NEXT: $sgpr70 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 0 + ; GCN-NEXT: $sgpr69 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 31 + ; GCN-NEXT: $sgpr68 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 30 + ; GCN-NEXT: $sgpr67 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 29 + ; GCN-NEXT: $sgpr66 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 28 + ; GCN-NEXT: $sgpr65 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 27 + ; GCN-NEXT: $sgpr64 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 26 ; GCN-NEXT: $sgpr29 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 25 ; GCN-NEXT: $sgpr28 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 24 ; GCN-NEXT: $sgpr27 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 23 @@ -200,11 +200,11 @@ body: | ; GCN-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 ; GCN-NEXT: $sgpr32 = frame-destroy COPY $sgpr33 ; GCN-NEXT: $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GCN-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.69, addrspace 5) - ; GCN-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.70, addrspace 5) - ; GCN-NEXT: $vgpr3 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.71, addrspace 5) - ; GCN-NEXT: $vgpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 12, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.72, addrspace 5) - ; GCN-NEXT: $vgpr5 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.73, addrspace 5) + ; GCN-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.68, addrspace 5) + ; GCN-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.69, addrspace 5) + ; GCN-NEXT: $vgpr3 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.70, addrspace 5) + ; GCN-NEXT: $vgpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 12, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.71, addrspace 5) + ; GCN-NEXT: $vgpr5 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.72, addrspace 5) ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0 ; GCN-NEXT: $sgpr33 = frame-destroy COPY $vcc_hi ; GCN-NEXT: S_ENDPGM 0 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll index 702953c56a5cb..cb54b0ba629c3 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll @@ -152,8 +152,8 @@ define void @spill_sgpr_with_no_lower_vgpr_available() #0 { ; GCN-NEXT: s_mov_b64 s[2:3], s[22:23] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GCN-NEXT: v_readlane_b32 s31, v255, 1 ; GCN-NEXT: v_readlane_b32 s30, v255, 0 +; GCN-NEXT: v_readlane_b32 s31, v255, 1 ; GCN-NEXT: buffer_load_dword v254, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v253, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v252, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload @@ -445,8 +445,8 @@ define void @spill_to_lowest_available_vgpr() #0 { ; GCN-NEXT: s_mov_b64 s[2:3], s[22:23] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GCN-NEXT: v_readlane_b32 s31, v254, 1 ; GCN-NEXT: v_readlane_b32 s30, v254, 0 +; GCN-NEXT: v_readlane_b32 s31, v254, 1 ; GCN-NEXT: buffer_load_dword v253, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v252, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v251, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload @@ -1632,21 +1632,14 @@ define void @spill_sgpr_no_free_vgpr_ipra() #0 { ; GCN-NEXT: buffer_store_dword v254, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v255, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 s[16:17], exec -; GCN-NEXT: s_mov_b64 exec, 1 +; GCN-NEXT: s_mov_b64 exec, 3 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:456 ; GCN-NEXT: v_writelane_b32 v0, s30, 0 +; GCN-NEXT: v_writelane_b32 v0, s31, 1 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:448 ; 4-byte Folded Spill ; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:456 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: s_mov_b64 s[16:17], exec -; GCN-NEXT: s_mov_b64 exec, 1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:456 -; GCN-NEXT: v_writelane_b32 v0, s31, 0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:452 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:456 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_mov_b64 exec, s[16:17] ; GCN-NEXT: s_getpc_b64 s[16:17] ; GCN-NEXT: s_add_u32 s16, s16, child_function_ipra@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s17, s17, child_function_ipra@rel32@hi+12 @@ -1656,20 +1649,12 @@ define void @spill_sgpr_no_free_vgpr_ipra() #0 { ; GCN-NEXT: s_mov_b64 s[2:3], s[22:23] ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: s_mov_b64 s[4:5], exec -; GCN-NEXT: s_mov_b64 exec, 1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:456 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:452 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s31, v0, 0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:456 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_mov_b64 s[4:5], exec -; GCN-NEXT: s_mov_b64 exec, 1 +; GCN-NEXT: s_mov_b64 exec, 3 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:456 ; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:448 ; 4-byte Folded Reload ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s30, v0, 0 +; GCN-NEXT: v_readlane_b32 s31, v0, 1 ; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:456 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_mov_b64 exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll index 1c2215d39dc02..feaca47f98e36 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll @@ -14610,13 +14610,13 @@ define void @s_shuffle_v2i64_v8i64__15_2() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s30 ; GFX900-NEXT: s_mov_b32 s9, s31 +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_mov_b32 s10, s12 ; GFX900-NEXT: s_mov_b32 s11, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_readlane_b32 s31, v0, 1 -; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX900-NEXT: s_mov_b64 exec, s[4:5] @@ -14639,13 +14639,13 @@ define void @s_shuffle_v2i64_v8i64__15_2() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s30 ; GFX90A-NEXT: s_mov_b32 s9, s31 +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_mov_b32 s10, s12 ; GFX90A-NEXT: s_mov_b32 s11, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 -; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX90A-NEXT: s_mov_b64 exec, s[4:5] @@ -14750,13 +14750,13 @@ define void @s_shuffle_v2i64_v8i64__15_4() { ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_mov_b32 s8, s30 ; GFX900-NEXT: s_mov_b32 s9, s31 +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_mov_b32 s10, s12 ; GFX900-NEXT: s_mov_b32 s11, s13 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_readlane_b32 s31, v0, 1 -; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX900-NEXT: s_mov_b64 exec, s[4:5] @@ -14779,13 +14779,13 @@ define void @s_shuffle_v2i64_v8i64__15_4() { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: s_mov_b32 s8, s30 ; GFX90A-NEXT: s_mov_b32 s9, s31 +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_mov_b32 s10, s12 ; GFX90A-NEXT: s_mov_b32 s11, s13 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 -; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX90A-NEXT: s_mov_b64 exec, s[4:5] @@ -14802,19 +14802,19 @@ define void @s_shuffle_v2i64_v8i64__15_4() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[4:19] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 ; GFX942-NEXT: v_writelane_b32 v0, s31, 1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[16:31] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s30 ; GFX942-NEXT: s_mov_b32 s9, s31 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_readlane_b32 s31, v0, 1 -; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload ; GFX942-NEXT: s_mov_b64 exec, s[0:1] @@ -14845,12 +14845,12 @@ define void @s_shuffle_v2i64_v8i64__15_5() { ; GFX900-NEXT: s_mov_b32 s12, s30 ; GFX900-NEXT: s_mov_b32 s13, s31 ; GFX900-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_mov_b64 s[10:11], s[14:15] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_readlane_b32 s31, v0, 1 -; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX900-NEXT: s_mov_b64 exec, s[4:5] @@ -14874,12 +14874,12 @@ define void @s_shuffle_v2i64_v8i64__15_5() { ; GFX90A-NEXT: s_mov_b32 s12, s30 ; GFX90A-NEXT: s_mov_b32 s13, s31 ; GFX90A-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_mov_b64 s[10:11], s[14:15] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 -; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX90A-NEXT: s_mov_b64 exec, s[4:5] @@ -14999,22 +14999,22 @@ define void @s_shuffle_v2i64_v8i64__15_6() { ; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill ; GFX942-NEXT: s_mov_b64 exec, s[0:1] ; GFX942-NEXT: v_writelane_b32 v0, s30, 0 -; GFX942-NEXT: v_writelane_b32 v0, s31, 1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: v_writelane_b32 v0, s31, 1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[16:31] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s30 ; GFX942-NEXT: s_mov_b32 s9, s31 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_readlane_b32 s31, v0, 1 -; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload ; GFX942-NEXT: s_mov_b64 exec, s[0:1] @@ -15120,6 +15120,7 @@ define void @s_shuffle_v2i64_v8i64__15_7() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_writelane_b32 v0, s31, 1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[16:31] @@ -15127,12 +15128,12 @@ define void @s_shuffle_v2i64_v8i64__15_7() { ; GFX942-NEXT: s_mov_b32 s12, s30 ; GFX942-NEXT: s_mov_b32 s13, s31 ; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_readlane_b32 s31, v0, 1 -; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload ; GFX942-NEXT: s_mov_b64 exec, s[0:1] @@ -16167,20 +16168,21 @@ define void @s_shuffle_v2i64_v8i64__12_0() { ; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill ; GFX942-NEXT: s_mov_b64 exec, s[0:1] ; GFX942-NEXT: v_writelane_b32 v0, s30, 0 -; GFX942-NEXT: v_writelane_b32 v0, s31, 1 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[16:31] +; GFX942-NEXT: ; def s[0:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_writelane_b32 v0, s31, 1 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ; def s[16:31] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s10, s16 +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_mov_b32 s11, s17 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_readlane_b32 s31, v0, 1 -; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload ; GFX942-NEXT: s_mov_b64 exec, s[0:1] @@ -16890,20 +16892,21 @@ define void @s_shuffle_v2i64_v8i64__12_1() { ; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill ; GFX942-NEXT: s_mov_b64 exec, s[0:1] ; GFX942-NEXT: v_writelane_b32 v0, s30, 0 -; GFX942-NEXT: v_writelane_b32 v0, s31, 1 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[16:31] +; GFX942-NEXT: ; def s[0:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_writelane_b32 v0, s31, 1 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ; def s[16:31] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s10, s18 +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_mov_b32 s11, s19 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_readlane_b32 s31, v0, 1 -; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload ; GFX942-NEXT: s_mov_b64 exec, s[0:1] @@ -17481,6 +17484,7 @@ define void @s_shuffle_v2i64_v8i64__9_2() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[16:31] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_mov_b32 s8, s18 ; GFX900-NEXT: s_mov_b32 s9, s19 ; GFX900-NEXT: s_mov_b32 s10, s12 @@ -17489,7 +17493,6 @@ define void @s_shuffle_v2i64_v8i64__9_2() { ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_readlane_b32 s31, v0, 1 -; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX900-NEXT: s_mov_b64 exec, s[4:5] @@ -17510,6 +17513,7 @@ define void @s_shuffle_v2i64_v8i64__9_2() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[16:31] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_mov_b32 s8, s18 ; GFX90A-NEXT: s_mov_b32 s9, s19 ; GFX90A-NEXT: s_mov_b32 s10, s12 @@ -17518,7 +17522,6 @@ define void @s_shuffle_v2i64_v8i64__9_2() { ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 -; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX90A-NEXT: s_mov_b64 exec, s[4:5] @@ -17565,13 +17568,13 @@ define void @s_shuffle_v2i64_v8i64__10_2() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_mov_b32 s10, s20 ; GFX900-NEXT: s_mov_b32 s11, s21 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_readlane_b32 s31, v0, 1 -; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX900-NEXT: s_mov_b64 exec, s[4:5] @@ -17592,13 +17595,13 @@ define void @s_shuffle_v2i64_v8i64__10_2() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_mov_b32 s10, s20 ; GFX90A-NEXT: s_mov_b32 s11, s21 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 -; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX90A-NEXT: s_mov_b64 exec, s[4:5] @@ -17612,6 +17615,7 @@ define void @s_shuffle_v2i64_v8i64__10_2() { ; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill ; GFX942-NEXT: s_mov_b64 exec, s[0:1] ; GFX942-NEXT: v_writelane_b32 v0, s30, 0 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_writelane_b32 v0, s31, 1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[16:31] @@ -17620,13 +17624,13 @@ define void @s_shuffle_v2i64_v8i64__10_2() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[4:19] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_mov_b32 s10, s20 ; GFX942-NEXT: s_mov_b32 s11, s21 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_readlane_b32 s31, v0, 1 -; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload ; GFX942-NEXT: s_mov_b64 exec, s[0:1] @@ -17654,6 +17658,7 @@ define void @s_shuffle_v2i64_v8i64__11_2() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[16:31] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_mov_b32 s8, s22 ; GFX900-NEXT: s_mov_b32 s9, s23 ; GFX900-NEXT: s_mov_b32 s10, s12 @@ -17662,7 +17667,6 @@ define void @s_shuffle_v2i64_v8i64__11_2() { ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_readlane_b32 s31, v0, 1 -; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX900-NEXT: s_mov_b64 exec, s[4:5] @@ -17683,6 +17687,7 @@ define void @s_shuffle_v2i64_v8i64__11_2() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[16:31] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_mov_b32 s8, s22 ; GFX90A-NEXT: s_mov_b32 s9, s23 ; GFX90A-NEXT: s_mov_b32 s10, s12 @@ -17691,7 +17696,6 @@ define void @s_shuffle_v2i64_v8i64__11_2() { ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 -; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX90A-NEXT: s_mov_b64 exec, s[4:5] @@ -17798,6 +17802,7 @@ define void @s_shuffle_v2i64_v8i64__13_2() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[16:31] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_mov_b32 s8, s26 ; GFX900-NEXT: s_mov_b32 s9, s27 ; GFX900-NEXT: s_mov_b32 s10, s12 @@ -17806,7 +17811,6 @@ define void @s_shuffle_v2i64_v8i64__13_2() { ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_readlane_b32 s31, v0, 1 -; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX900-NEXT: s_mov_b64 exec, s[4:5] @@ -17827,6 +17831,7 @@ define void @s_shuffle_v2i64_v8i64__13_2() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[16:31] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_mov_b32 s8, s26 ; GFX90A-NEXT: s_mov_b32 s9, s27 ; GFX90A-NEXT: s_mov_b32 s10, s12 @@ -17835,7 +17840,6 @@ define void @s_shuffle_v2i64_v8i64__13_2() { ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 -; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX90A-NEXT: s_mov_b64 exec, s[4:5] @@ -18315,13 +18319,13 @@ define void @s_shuffle_v2i64_v8i64__10_3() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_mov_b32 s10, s22 ; GFX900-NEXT: s_mov_b32 s11, s23 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_readlane_b32 s31, v0, 1 -; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX900-NEXT: s_mov_b64 exec, s[4:5] @@ -18342,13 +18346,13 @@ define void @s_shuffle_v2i64_v8i64__10_3() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_mov_b32 s10, s22 ; GFX90A-NEXT: s_mov_b32 s11, s23 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 -; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX90A-NEXT: s_mov_b64 exec, s[4:5] @@ -18362,6 +18366,7 @@ define void @s_shuffle_v2i64_v8i64__10_3() { ; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill ; GFX942-NEXT: s_mov_b64 exec, s[0:1] ; GFX942-NEXT: v_writelane_b32 v0, s30, 0 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_writelane_b32 v0, s31, 1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[16:31] @@ -18370,13 +18375,13 @@ define void @s_shuffle_v2i64_v8i64__10_3() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[4:19] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_mov_b32 s10, s22 ; GFX942-NEXT: s_mov_b32 s11, s23 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_readlane_b32 s31, v0, 1 -; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload ; GFX942-NEXT: s_mov_b64 exec, s[0:1] @@ -18950,6 +18955,7 @@ define void @s_shuffle_v2i64_v8i64__9_4() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[16:31] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_mov_b32 s8, s18 ; GFX900-NEXT: s_mov_b32 s9, s19 ; GFX900-NEXT: s_mov_b32 s10, s12 @@ -18958,7 +18964,6 @@ define void @s_shuffle_v2i64_v8i64__9_4() { ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_readlane_b32 s31, v0, 1 -; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX900-NEXT: s_mov_b64 exec, s[4:5] @@ -18979,6 +18984,7 @@ define void @s_shuffle_v2i64_v8i64__9_4() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[16:31] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_mov_b32 s8, s18 ; GFX90A-NEXT: s_mov_b32 s9, s19 ; GFX90A-NEXT: s_mov_b32 s10, s12 @@ -18987,7 +18993,6 @@ define void @s_shuffle_v2i64_v8i64__9_4() { ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 -; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX90A-NEXT: s_mov_b64 exec, s[4:5] @@ -19004,19 +19009,19 @@ define void @s_shuffle_v2i64_v8i64__9_4() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[4:19] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 ; GFX942-NEXT: v_writelane_b32 v0, s31, 1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[16:31] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_mov_b32 s9, s19 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_readlane_b32 s31, v0, 1 -; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload ; GFX942-NEXT: s_mov_b64 exec, s[0:1] @@ -19100,6 +19105,7 @@ define void @s_shuffle_v2i64_v8i64__11_4() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[16:31] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_mov_b32 s8, s22 ; GFX900-NEXT: s_mov_b32 s9, s23 ; GFX900-NEXT: s_mov_b32 s10, s12 @@ -19108,7 +19114,6 @@ define void @s_shuffle_v2i64_v8i64__11_4() { ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_readlane_b32 s31, v0, 1 -; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX900-NEXT: s_mov_b64 exec, s[4:5] @@ -19129,6 +19134,7 @@ define void @s_shuffle_v2i64_v8i64__11_4() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[16:31] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_mov_b32 s8, s22 ; GFX90A-NEXT: s_mov_b32 s9, s23 ; GFX90A-NEXT: s_mov_b32 s10, s12 @@ -19137,7 +19143,6 @@ define void @s_shuffle_v2i64_v8i64__11_4() { ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 -; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX90A-NEXT: s_mov_b64 exec, s[4:5] @@ -19154,19 +19159,19 @@ define void @s_shuffle_v2i64_v8i64__11_4() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[4:19] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 ; GFX942-NEXT: v_writelane_b32 v0, s31, 1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[16:31] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s22 +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_mov_b32 s9, s23 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_readlane_b32 s31, v0, 1 -; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload ; GFX942-NEXT: s_mov_b64 exec, s[0:1] @@ -19197,12 +19202,12 @@ define void @s_shuffle_v2i64_v8i64__12_4() { ; GFX900-NEXT: s_mov_b32 s26, s12 ; GFX900-NEXT: s_mov_b32 s27, s13 ; GFX900-NEXT: s_mov_b64 s[8:9], s[24:25] +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_mov_b64 s[10:11], s[26:27] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_readlane_b32 s31, v0, 1 -; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX900-NEXT: s_mov_b64 exec, s[4:5] @@ -19226,12 +19231,12 @@ define void @s_shuffle_v2i64_v8i64__12_4() { ; GFX90A-NEXT: s_mov_b32 s26, s12 ; GFX90A-NEXT: s_mov_b32 s27, s13 ; GFX90A-NEXT: s_mov_b64 s[8:9], s[24:25] +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_mov_b64 s[10:11], s[26:27] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 -; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX90A-NEXT: s_mov_b64 exec, s[4:5] @@ -19276,6 +19281,7 @@ define void @s_shuffle_v2i64_v8i64__13_4() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[16:31] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_mov_b32 s8, s26 ; GFX900-NEXT: s_mov_b32 s9, s27 ; GFX900-NEXT: s_mov_b32 s10, s12 @@ -19284,7 +19290,6 @@ define void @s_shuffle_v2i64_v8i64__13_4() { ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_readlane_b32 s31, v0, 1 -; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX900-NEXT: s_mov_b64 exec, s[4:5] @@ -19305,6 +19310,7 @@ define void @s_shuffle_v2i64_v8i64__13_4() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[16:31] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_mov_b32 s8, s26 ; GFX90A-NEXT: s_mov_b32 s9, s27 ; GFX90A-NEXT: s_mov_b32 s10, s12 @@ -19313,7 +19319,6 @@ define void @s_shuffle_v2i64_v8i64__13_4() { ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 -; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX90A-NEXT: s_mov_b64 exec, s[4:5] @@ -19330,19 +19335,19 @@ define void @s_shuffle_v2i64_v8i64__13_4() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[4:19] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 ; GFX942-NEXT: v_writelane_b32 v0, s31, 1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[16:31] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s26 +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_mov_b32 s9, s27 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_readlane_b32 s31, v0, 1 -; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload ; GFX942-NEXT: s_mov_b64 exec, s[0:1] @@ -19374,11 +19379,11 @@ define void @s_shuffle_v2i64_v8i64__14_4() { ; GFX900-NEXT: s_mov_b32 s31, s13 ; GFX900-NEXT: s_mov_b64 s[8:9], s[28:29] ; GFX900-NEXT: s_mov_b64 s[10:11], s[30:31] +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_readlane_b32 s31, v0, 1 -; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX900-NEXT: s_mov_b64 exec, s[4:5] @@ -19403,11 +19408,11 @@ define void @s_shuffle_v2i64_v8i64__14_4() { ; GFX90A-NEXT: s_mov_b32 s31, s13 ; GFX90A-NEXT: s_mov_b64 s[8:9], s[28:29] ; GFX90A-NEXT: s_mov_b64 s[10:11], s[30:31] +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 -; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX90A-NEXT: s_mov_b64 exec, s[4:5] @@ -19874,12 +19879,12 @@ define void @s_shuffle_v2i64_v8i64__9_5() { ; GFX900-NEXT: s_mov_b32 s12, s18 ; GFX900-NEXT: s_mov_b32 s13, s19 ; GFX900-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_mov_b64 s[10:11], s[14:15] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_readlane_b32 s31, v0, 1 -; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX900-NEXT: s_mov_b64 exec, s[4:5] @@ -19903,12 +19908,12 @@ define void @s_shuffle_v2i64_v8i64__9_5() { ; GFX90A-NEXT: s_mov_b32 s12, s18 ; GFX90A-NEXT: s_mov_b32 s13, s19 ; GFX90A-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_mov_b64 s[10:11], s[14:15] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 -; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX90A-NEXT: s_mov_b64 exec, s[4:5] @@ -20012,12 +20017,12 @@ define void @s_shuffle_v2i64_v8i64__11_5() { ; GFX900-NEXT: s_mov_b32 s12, s22 ; GFX900-NEXT: s_mov_b32 s13, s23 ; GFX900-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_mov_b64 s[10:11], s[14:15] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_readlane_b32 s31, v0, 1 -; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX900-NEXT: s_mov_b64 exec, s[4:5] @@ -20041,12 +20046,12 @@ define void @s_shuffle_v2i64_v8i64__11_5() { ; GFX90A-NEXT: s_mov_b32 s12, s22 ; GFX90A-NEXT: s_mov_b32 s13, s23 ; GFX90A-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_mov_b64 s[10:11], s[14:15] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 -; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX90A-NEXT: s_mov_b64 exec, s[4:5] @@ -20094,12 +20099,12 @@ define void @s_shuffle_v2i64_v8i64__12_5() { ; GFX900-NEXT: s_mov_b32 s26, s14 ; GFX900-NEXT: s_mov_b32 s27, s15 ; GFX900-NEXT: s_mov_b64 s[8:9], s[24:25] +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_mov_b64 s[10:11], s[26:27] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_readlane_b32 s31, v0, 1 -; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX900-NEXT: s_mov_b64 exec, s[4:5] @@ -20123,12 +20128,12 @@ define void @s_shuffle_v2i64_v8i64__12_5() { ; GFX90A-NEXT: s_mov_b32 s26, s14 ; GFX90A-NEXT: s_mov_b32 s27, s15 ; GFX90A-NEXT: s_mov_b64 s[8:9], s[24:25] +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_mov_b64 s[10:11], s[26:27] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 -; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX90A-NEXT: s_mov_b64 exec, s[4:5] @@ -20176,12 +20181,12 @@ define void @s_shuffle_v2i64_v8i64__13_5() { ; GFX900-NEXT: s_mov_b32 s12, s26 ; GFX900-NEXT: s_mov_b32 s13, s27 ; GFX900-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_mov_b64 s[10:11], s[14:15] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_readlane_b32 s31, v0, 1 -; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX900-NEXT: s_mov_b64 exec, s[4:5] @@ -20205,12 +20210,12 @@ define void @s_shuffle_v2i64_v8i64__13_5() { ; GFX90A-NEXT: s_mov_b32 s12, s26 ; GFX90A-NEXT: s_mov_b32 s13, s27 ; GFX90A-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_mov_b64 s[10:11], s[14:15] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 -; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX90A-NEXT: s_mov_b64 exec, s[4:5] @@ -20259,11 +20264,11 @@ define void @s_shuffle_v2i64_v8i64__14_5() { ; GFX900-NEXT: s_mov_b32 s31, s15 ; GFX900-NEXT: s_mov_b64 s[8:9], s[28:29] ; GFX900-NEXT: s_mov_b64 s[10:11], s[30:31] +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_readlane_b32 s31, v0, 1 -; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX900-NEXT: s_mov_b64 exec, s[4:5] @@ -20288,11 +20293,11 @@ define void @s_shuffle_v2i64_v8i64__14_5() { ; GFX90A-NEXT: s_mov_b32 s31, s15 ; GFX90A-NEXT: s_mov_b64 s[8:9], s[28:29] ; GFX90A-NEXT: s_mov_b64 s[10:11], s[30:31] +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 -; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX90A-NEXT: s_mov_b64 exec, s[4:5] @@ -20846,22 +20851,22 @@ define void @s_shuffle_v2i64_v8i64__9_6() { ; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill ; GFX942-NEXT: s_mov_b64 exec, s[0:1] ; GFX942-NEXT: v_writelane_b32 v0, s30, 0 -; GFX942-NEXT: v_writelane_b32 v0, s31, 1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: v_writelane_b32 v0, s31, 1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[16:31] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_mov_b32 s9, s19 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_readlane_b32 s31, v0, 1 -; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload ; GFX942-NEXT: s_mov_b64 exec, s[0:1] @@ -21020,22 +21025,22 @@ define void @s_shuffle_v2i64_v8i64__11_6() { ; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill ; GFX942-NEXT: s_mov_b64 exec, s[0:1] ; GFX942-NEXT: v_writelane_b32 v0, s30, 0 -; GFX942-NEXT: v_writelane_b32 v0, s31, 1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: v_writelane_b32 v0, s31, 1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[16:31] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s22 +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_mov_b32 s9, s23 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_readlane_b32 s31, v0, 1 -; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload ; GFX942-NEXT: s_mov_b64 exec, s[0:1] @@ -21244,22 +21249,22 @@ define void @s_shuffle_v2i64_v8i64__13_6() { ; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill ; GFX942-NEXT: s_mov_b64 exec, s[0:1] ; GFX942-NEXT: v_writelane_b32 v0, s30, 0 -; GFX942-NEXT: v_writelane_b32 v0, s31, 1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s10, s12 +; GFX942-NEXT: s_mov_b32 s11, s13 +; GFX942-NEXT: v_writelane_b32 v0, s31, 1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[16:31] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s26 +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_mov_b32 s9, s27 -; GFX942-NEXT: s_mov_b32 s10, s12 -; GFX942-NEXT: s_mov_b32 s11, s13 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_readlane_b32 s31, v0, 1 -; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload ; GFX942-NEXT: s_mov_b64 exec, s[0:1] @@ -21362,10 +21367,11 @@ define void @s_shuffle_v2i64_v8i64__14_6() { ; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill ; GFX942-NEXT: s_mov_b64 exec, s[0:1] ; GFX942-NEXT: v_writelane_b32 v0, s30, 0 -; GFX942-NEXT: v_writelane_b32 v0, s31, 1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_writelane_b32 v0, s31, 1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[16:31] ; GFX942-NEXT: ;;#ASMEND @@ -21373,11 +21379,11 @@ define void @s_shuffle_v2i64_v8i64__14_6() { ; GFX942-NEXT: s_mov_b32 s31, s13 ; GFX942-NEXT: s_mov_b64 s[8:9], s[28:29] ; GFX942-NEXT: s_mov_b64 s[10:11], s[30:31] +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_readlane_b32 s31, v0, 1 -; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload ; GFX942-NEXT: s_mov_b64 exec, s[0:1] @@ -21909,6 +21915,7 @@ define void @s_shuffle_v2i64_v8i64__9_7() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_writelane_b32 v0, s31, 1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[16:31] @@ -21916,12 +21923,12 @@ define void @s_shuffle_v2i64_v8i64__9_7() { ; GFX942-NEXT: s_mov_b32 s12, s18 ; GFX942-NEXT: s_mov_b32 s13, s19 ; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_readlane_b32 s31, v0, 1 -; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload ; GFX942-NEXT: s_mov_b64 exec, s[0:1] @@ -22083,6 +22090,7 @@ define void @s_shuffle_v2i64_v8i64__11_7() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_writelane_b32 v0, s31, 1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[16:31] @@ -22090,12 +22098,12 @@ define void @s_shuffle_v2i64_v8i64__11_7() { ; GFX942-NEXT: s_mov_b32 s12, s22 ; GFX942-NEXT: s_mov_b32 s13, s23 ; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_readlane_b32 s31, v0, 1 -; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload ; GFX942-NEXT: s_mov_b64 exec, s[0:1] @@ -22307,6 +22315,7 @@ define void @s_shuffle_v2i64_v8i64__13_7() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_writelane_b32 v0, s31, 1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[16:31] @@ -22314,12 +22323,12 @@ define void @s_shuffle_v2i64_v8i64__13_7() { ; GFX942-NEXT: s_mov_b32 s12, s26 ; GFX942-NEXT: s_mov_b32 s13, s27 ; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_readlane_b32 s31, v0, 1 -; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload ; GFX942-NEXT: s_mov_b64 exec, s[0:1] @@ -22422,10 +22431,11 @@ define void @s_shuffle_v2i64_v8i64__14_7() { ; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill ; GFX942-NEXT: s_mov_b64 exec, s[0:1] ; GFX942-NEXT: v_writelane_b32 v0, s30, 0 -; GFX942-NEXT: v_writelane_b32 v0, s31, 1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_writelane_b32 v0, s31, 1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[16:31] ; GFX942-NEXT: ;;#ASMEND @@ -22433,11 +22443,11 @@ define void @s_shuffle_v2i64_v8i64__14_7() { ; GFX942-NEXT: s_mov_b32 s31, s15 ; GFX942-NEXT: s_mov_b64 s[8:9], s[28:29] ; GFX942-NEXT: s_mov_b64 s[10:11], s[30:31] +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_readlane_b32 s31, v0, 1 -; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload ; GFX942-NEXT: s_mov_b64 exec, s[0:1] @@ -23434,12 +23444,12 @@ define void @s_shuffle_v2i64_v8i64__4_9() { ; GFX900-NEXT: s_mov_b32 s14, s18 ; GFX900-NEXT: s_mov_b32 s15, s19 ; GFX900-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_mov_b64 s[10:11], s[14:15] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_readlane_b32 s31, v0, 1 -; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX900-NEXT: s_mov_b64 exec, s[4:5] @@ -23463,12 +23473,12 @@ define void @s_shuffle_v2i64_v8i64__4_9() { ; GFX90A-NEXT: s_mov_b32 s14, s18 ; GFX90A-NEXT: s_mov_b32 s15, s19 ; GFX90A-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_mov_b64 s[10:11], s[14:15] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 -; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX90A-NEXT: s_mov_b64 exec, s[4:5] @@ -23513,13 +23523,13 @@ define void @s_shuffle_v2i64_v8i64__5_9() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[8:23] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_mov_b32 s8, s26 ; GFX900-NEXT: s_mov_b32 s9, s27 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_readlane_b32 s31, v0, 1 -; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX900-NEXT: s_mov_b64 exec, s[4:5] @@ -23540,13 +23550,13 @@ define void @s_shuffle_v2i64_v8i64__5_9() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[8:23] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_mov_b32 s8, s26 ; GFX90A-NEXT: s_mov_b32 s9, s27 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 -; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX90A-NEXT: s_mov_b64 exec, s[4:5] @@ -23560,6 +23570,7 @@ define void @s_shuffle_v2i64_v8i64__5_9() { ; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill ; GFX942-NEXT: s_mov_b64 exec, s[0:1] ; GFX942-NEXT: v_writelane_b32 v0, s30, 0 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_writelane_b32 v0, s31, 1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[16:31] @@ -23568,13 +23579,13 @@ define void @s_shuffle_v2i64_v8i64__5_9() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[8:23] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_mov_b32 s8, s26 ; GFX942-NEXT: s_mov_b32 s9, s27 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_readlane_b32 s31, v0, 1 -; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload ; GFX942-NEXT: s_mov_b64 exec, s[0:1] @@ -23680,6 +23691,7 @@ define void @s_shuffle_v2i64_v8i64__6_9() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_writelane_b32 v0, s31, 1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[16:31] @@ -23687,12 +23699,12 @@ define void @s_shuffle_v2i64_v8i64__6_9() { ; GFX942-NEXT: s_mov_b32 s14, s18 ; GFX942-NEXT: s_mov_b32 s15, s19 ; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_readlane_b32 s31, v0, 1 -; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload ; GFX942-NEXT: s_mov_b64 exec, s[0:1] @@ -24284,12 +24296,12 @@ define void @s_shuffle_v2i64_v8i64__4_10() { ; GFX900-NEXT: s_mov_b32 s14, s20 ; GFX900-NEXT: s_mov_b32 s15, s21 ; GFX900-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_mov_b64 s[10:11], s[14:15] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_readlane_b32 s31, v0, 1 -; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX900-NEXT: s_mov_b64 exec, s[4:5] @@ -24313,12 +24325,12 @@ define void @s_shuffle_v2i64_v8i64__4_10() { ; GFX90A-NEXT: s_mov_b32 s14, s20 ; GFX90A-NEXT: s_mov_b32 s15, s21 ; GFX90A-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_mov_b64 s[10:11], s[14:15] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 -; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX90A-NEXT: s_mov_b64 exec, s[4:5] @@ -24363,6 +24375,7 @@ define void @s_shuffle_v2i64_v8i64__5_10() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[16:31] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_mov_b32 s8, s14 ; GFX900-NEXT: s_mov_b32 s9, s15 ; GFX900-NEXT: s_mov_b32 s10, s20 @@ -24371,7 +24384,6 @@ define void @s_shuffle_v2i64_v8i64__5_10() { ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_readlane_b32 s31, v0, 1 -; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX900-NEXT: s_mov_b64 exec, s[4:5] @@ -24392,6 +24404,7 @@ define void @s_shuffle_v2i64_v8i64__5_10() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[16:31] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_mov_b32 s8, s14 ; GFX90A-NEXT: s_mov_b32 s9, s15 ; GFX90A-NEXT: s_mov_b32 s10, s20 @@ -24400,7 +24413,6 @@ define void @s_shuffle_v2i64_v8i64__5_10() { ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 -; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX90A-NEXT: s_mov_b64 exec, s[4:5] @@ -24524,6 +24536,7 @@ define void @s_shuffle_v2i64_v8i64__6_10() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_writelane_b32 v0, s31, 1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[16:31] @@ -24531,12 +24544,12 @@ define void @s_shuffle_v2i64_v8i64__6_10() { ; GFX942-NEXT: s_mov_b32 s14, s20 ; GFX942-NEXT: s_mov_b32 s15, s21 ; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_readlane_b32 s31, v0, 1 -; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload ; GFX942-NEXT: s_mov_b64 exec, s[0:1] @@ -24639,22 +24652,22 @@ define void @s_shuffle_v2i64_v8i64__7_10() { ; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill ; GFX942-NEXT: s_mov_b64 exec, s[0:1] ; GFX942-NEXT: v_writelane_b32 v0, s30, 0 -; GFX942-NEXT: v_writelane_b32 v0, s31, 1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: v_writelane_b32 v0, s31, 1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[16:31] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 ; GFX942-NEXT: s_mov_b32 s10, s20 +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_mov_b32 s11, s21 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_readlane_b32 s31, v0, 1 -; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload ; GFX942-NEXT: s_mov_b64 exec, s[0:1] @@ -25235,13 +25248,13 @@ define void @s_shuffle_v2i64_v8i64__3_11() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[4:19] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_mov_b32 s8, s22 ; GFX900-NEXT: s_mov_b32 s9, s23 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_readlane_b32 s31, v0, 1 -; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX900-NEXT: s_mov_b64 exec, s[4:5] @@ -25262,13 +25275,13 @@ define void @s_shuffle_v2i64_v8i64__3_11() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[4:19] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_mov_b32 s8, s22 ; GFX90A-NEXT: s_mov_b32 s9, s23 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 -; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX90A-NEXT: s_mov_b64 exec, s[4:5] @@ -25282,6 +25295,7 @@ define void @s_shuffle_v2i64_v8i64__3_11() { ; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill ; GFX942-NEXT: s_mov_b64 exec, s[0:1] ; GFX942-NEXT: v_writelane_b32 v0, s30, 0 +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_writelane_b32 v0, s31, 1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[16:31] @@ -25290,13 +25304,13 @@ define void @s_shuffle_v2i64_v8i64__3_11() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[4:19] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_mov_b32 s8, s22 ; GFX942-NEXT: s_mov_b32 s9, s23 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_readlane_b32 s31, v0, 1 -; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload ; GFX942-NEXT: s_mov_b64 exec, s[0:1] @@ -25327,12 +25341,12 @@ define void @s_shuffle_v2i64_v8i64__4_11() { ; GFX900-NEXT: s_mov_b32 s14, s22 ; GFX900-NEXT: s_mov_b32 s15, s23 ; GFX900-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_mov_b64 s[10:11], s[14:15] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_readlane_b32 s31, v0, 1 -; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX900-NEXT: s_mov_b64 exec, s[4:5] @@ -25356,12 +25370,12 @@ define void @s_shuffle_v2i64_v8i64__4_11() { ; GFX90A-NEXT: s_mov_b32 s14, s22 ; GFX90A-NEXT: s_mov_b32 s15, s23 ; GFX90A-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_mov_b64 s[10:11], s[14:15] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 -; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX90A-NEXT: s_mov_b64 exec, s[4:5] @@ -25540,6 +25554,7 @@ define void @s_shuffle_v2i64_v8i64__6_11() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_writelane_b32 v0, s31, 1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[16:31] @@ -25547,12 +25562,12 @@ define void @s_shuffle_v2i64_v8i64__6_11() { ; GFX942-NEXT: s_mov_b32 s14, s22 ; GFX942-NEXT: s_mov_b32 s15, s23 ; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_readlane_b32 s31, v0, 1 -; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload ; GFX942-NEXT: s_mov_b64 exec, s[0:1] @@ -26144,12 +26159,12 @@ define void @s_shuffle_v2i64_v8i64__4_12() { ; GFX900-NEXT: s_mov_b32 s14, s24 ; GFX900-NEXT: s_mov_b32 s15, s25 ; GFX900-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_mov_b64 s[10:11], s[14:15] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_readlane_b32 s31, v0, 1 -; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX900-NEXT: s_mov_b64 exec, s[4:5] @@ -26173,12 +26188,12 @@ define void @s_shuffle_v2i64_v8i64__4_12() { ; GFX90A-NEXT: s_mov_b32 s14, s24 ; GFX90A-NEXT: s_mov_b32 s15, s25 ; GFX90A-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_mov_b64 s[10:11], s[14:15] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 -; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX90A-NEXT: s_mov_b64 exec, s[4:5] @@ -26223,6 +26238,7 @@ define void @s_shuffle_v2i64_v8i64__5_12() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[16:31] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_mov_b32 s8, s14 ; GFX900-NEXT: s_mov_b32 s9, s15 ; GFX900-NEXT: s_mov_b32 s10, s24 @@ -26231,7 +26247,6 @@ define void @s_shuffle_v2i64_v8i64__5_12() { ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_readlane_b32 s31, v0, 1 -; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX900-NEXT: s_mov_b64 exec, s[4:5] @@ -26252,6 +26267,7 @@ define void @s_shuffle_v2i64_v8i64__5_12() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[16:31] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_mov_b32 s8, s14 ; GFX90A-NEXT: s_mov_b32 s9, s15 ; GFX90A-NEXT: s_mov_b32 s10, s24 @@ -26260,7 +26276,6 @@ define void @s_shuffle_v2i64_v8i64__5_12() { ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 -; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX90A-NEXT: s_mov_b64 exec, s[4:5] @@ -26384,6 +26399,7 @@ define void @s_shuffle_v2i64_v8i64__6_12() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_writelane_b32 v0, s31, 1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[16:31] @@ -26391,12 +26407,12 @@ define void @s_shuffle_v2i64_v8i64__6_12() { ; GFX942-NEXT: s_mov_b32 s14, s24 ; GFX942-NEXT: s_mov_b32 s15, s25 ; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_readlane_b32 s31, v0, 1 -; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload ; GFX942-NEXT: s_mov_b64 exec, s[0:1] @@ -26499,22 +26515,22 @@ define void @s_shuffle_v2i64_v8i64__7_12() { ; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill ; GFX942-NEXT: s_mov_b64 exec, s[0:1] ; GFX942-NEXT: v_writelane_b32 v0, s30, 0 -; GFX942-NEXT: v_writelane_b32 v0, s31, 1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: v_writelane_b32 v0, s31, 1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[16:31] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 ; GFX942-NEXT: s_mov_b32 s10, s24 +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_mov_b32 s11, s25 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_readlane_b32 s31, v0, 1 -; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload ; GFX942-NEXT: s_mov_b64 exec, s[0:1] @@ -26880,20 +26896,21 @@ define void @s_shuffle_v2i64_v8i64__1_13() { ; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill ; GFX942-NEXT: s_mov_b64 exec, s[0:1] ; GFX942-NEXT: v_writelane_b32 v0, s30, 0 -; GFX942-NEXT: v_writelane_b32 v0, s31, 1 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[16:31] +; GFX942-NEXT: ; def s[0:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_writelane_b32 v0, s31, 1 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ; def s[16:31] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s8, s18 +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_mov_b32 s9, s19 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_readlane_b32 s31, v0, 1 -; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload ; GFX942-NEXT: s_mov_b64 exec, s[0:1] @@ -27040,12 +27057,12 @@ define void @s_shuffle_v2i64_v8i64__4_13() { ; GFX900-NEXT: s_mov_b32 s14, s26 ; GFX900-NEXT: s_mov_b32 s15, s27 ; GFX900-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_mov_b64 s[10:11], s[14:15] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_readlane_b32 s31, v0, 1 -; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX900-NEXT: s_mov_b64 exec, s[4:5] @@ -27069,12 +27086,12 @@ define void @s_shuffle_v2i64_v8i64__4_13() { ; GFX90A-NEXT: s_mov_b32 s14, s26 ; GFX90A-NEXT: s_mov_b32 s15, s27 ; GFX90A-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_mov_b64 s[10:11], s[14:15] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 -; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX90A-NEXT: s_mov_b64 exec, s[4:5] @@ -27122,12 +27139,12 @@ define void @s_shuffle_v2i64_v8i64__5_13() { ; GFX900-NEXT: s_mov_b32 s24, s14 ; GFX900-NEXT: s_mov_b32 s25, s15 ; GFX900-NEXT: s_mov_b64 s[8:9], s[24:25] +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_mov_b64 s[10:11], s[26:27] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_readlane_b32 s31, v0, 1 -; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX900-NEXT: s_mov_b64 exec, s[4:5] @@ -27151,12 +27168,12 @@ define void @s_shuffle_v2i64_v8i64__5_13() { ; GFX90A-NEXT: s_mov_b32 s24, s14 ; GFX90A-NEXT: s_mov_b32 s25, s15 ; GFX90A-NEXT: s_mov_b64 s[8:9], s[24:25] +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_mov_b64 s[10:11], s[26:27] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 -; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX90A-NEXT: s_mov_b64 exec, s[4:5] @@ -27279,6 +27296,7 @@ define void @s_shuffle_v2i64_v8i64__6_13() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_writelane_b32 v0, s31, 1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[16:31] @@ -27286,12 +27304,12 @@ define void @s_shuffle_v2i64_v8i64__6_13() { ; GFX942-NEXT: s_mov_b32 s14, s26 ; GFX942-NEXT: s_mov_b32 s15, s27 ; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_readlane_b32 s31, v0, 1 -; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload ; GFX942-NEXT: s_mov_b64 exec, s[0:1] @@ -27997,12 +28015,12 @@ define void @s_shuffle_v2i64_v8i64__4_14() { ; GFX900-NEXT: s_mov_b32 s14, s28 ; GFX900-NEXT: s_mov_b32 s15, s29 ; GFX900-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_mov_b64 s[10:11], s[14:15] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_readlane_b32 s31, v0, 1 -; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX900-NEXT: s_mov_b64 exec, s[4:5] @@ -28026,12 +28044,12 @@ define void @s_shuffle_v2i64_v8i64__4_14() { ; GFX90A-NEXT: s_mov_b32 s14, s28 ; GFX90A-NEXT: s_mov_b32 s15, s29 ; GFX90A-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_mov_b64 s[10:11], s[14:15] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 -; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX90A-NEXT: s_mov_b64 exec, s[4:5] @@ -28076,6 +28094,7 @@ define void @s_shuffle_v2i64_v8i64__5_14() { ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[16:31] ; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_mov_b32 s8, s14 ; GFX900-NEXT: s_mov_b32 s9, s15 ; GFX900-NEXT: s_mov_b32 s10, s28 @@ -28084,7 +28103,6 @@ define void @s_shuffle_v2i64_v8i64__5_14() { ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_readlane_b32 s31, v0, 1 -; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX900-NEXT: s_mov_b64 exec, s[4:5] @@ -28105,6 +28123,7 @@ define void @s_shuffle_v2i64_v8i64__5_14() { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def s[16:31] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_mov_b32 s8, s14 ; GFX90A-NEXT: s_mov_b32 s9, s15 ; GFX90A-NEXT: s_mov_b32 s10, s28 @@ -28113,7 +28132,6 @@ define void @s_shuffle_v2i64_v8i64__5_14() { ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 -; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX90A-NEXT: s_mov_b64 exec, s[4:5] @@ -28237,6 +28255,7 @@ define void @s_shuffle_v2i64_v8i64__6_14() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_writelane_b32 v0, s31, 1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[16:31] @@ -28244,12 +28263,12 @@ define void @s_shuffle_v2i64_v8i64__6_14() { ; GFX942-NEXT: s_mov_b32 s14, s28 ; GFX942-NEXT: s_mov_b32 s15, s29 ; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_readlane_b32 s31, v0, 1 -; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload ; GFX942-NEXT: s_mov_b64 exec, s[0:1] @@ -28352,22 +28371,22 @@ define void @s_shuffle_v2i64_v8i64__7_14() { ; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill ; GFX942-NEXT: s_mov_b64 exec, s[0:1] ; GFX942-NEXT: v_writelane_b32 v0, s30, 0 -; GFX942-NEXT: v_writelane_b32 v0, s31, 1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_mov_b32 s8, s14 +; GFX942-NEXT: s_mov_b32 s9, s15 +; GFX942-NEXT: v_writelane_b32 v0, s31, 1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[16:31] ; GFX942-NEXT: ;;#ASMEND -; GFX942-NEXT: s_mov_b32 s8, s14 -; GFX942-NEXT: s_mov_b32 s9, s15 ; GFX942-NEXT: s_mov_b32 s10, s28 +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_mov_b32 s11, s29 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_readlane_b32 s31, v0, 1 -; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload ; GFX942-NEXT: s_mov_b64 exec, s[0:1] @@ -28978,12 +28997,12 @@ define void @s_shuffle_v2i64_v8i64__4_15() { ; GFX900-NEXT: s_mov_b32 s14, s30 ; GFX900-NEXT: s_mov_b32 s15, s31 ; GFX900-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_mov_b64 s[10:11], s[14:15] ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_readlane_b32 s31, v0, 1 -; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX900-NEXT: s_mov_b64 exec, s[4:5] @@ -29007,12 +29026,12 @@ define void @s_shuffle_v2i64_v8i64__4_15() { ; GFX90A-NEXT: s_mov_b32 s14, s30 ; GFX90A-NEXT: s_mov_b32 s15, s31 ; GFX90A-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_mov_b64 s[10:11], s[14:15] ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 -; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX90A-NEXT: s_mov_b64 exec, s[4:5] @@ -29061,11 +29080,11 @@ define void @s_shuffle_v2i64_v8i64__5_15() { ; GFX900-NEXT: s_mov_b32 s29, s15 ; GFX900-NEXT: s_mov_b64 s[8:9], s[28:29] ; GFX900-NEXT: s_mov_b64 s[10:11], s[30:31] +; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s[8:11] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_readlane_b32 s31, v0, 1 -; GFX900-NEXT: v_readlane_b32 s30, v0, 0 ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX900-NEXT: s_mov_b64 exec, s[4:5] @@ -29090,11 +29109,11 @@ define void @s_shuffle_v2i64_v8i64__5_15() { ; GFX90A-NEXT: s_mov_b32 s29, s15 ; GFX90A-NEXT: s_mov_b64 s[8:9], s[28:29] ; GFX90A-NEXT: s_mov_b64 s[10:11], s[30:31] +; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s[8:11] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_readlane_b32 s31, v0, 1 -; GFX90A-NEXT: v_readlane_b32 s30, v0, 0 ; GFX90A-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX90A-NEXT: s_mov_b64 exec, s[4:5] @@ -29219,6 +29238,7 @@ define void @s_shuffle_v2i64_v8i64__6_15() { ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[0:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_writelane_b32 v0, s31, 1 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; def s[16:31] @@ -29226,12 +29246,12 @@ define void @s_shuffle_v2i64_v8i64__6_15() { ; GFX942-NEXT: s_mov_b32 s14, s30 ; GFX942-NEXT: s_mov_b32 s15, s31 ; GFX942-NEXT: s_mov_b64 s[8:9], s[12:13] +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_mov_b64 s[10:11], s[14:15] ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_readlane_b32 s31, v0, 1 -; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload ; GFX942-NEXT: s_mov_b64 exec, s[0:1] @@ -29334,22 +29354,23 @@ define void @s_shuffle_v2i64_v8i64__7_15() { ; GFX942-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill ; GFX942-NEXT: s_mov_b64 exec, s[0:1] ; GFX942-NEXT: v_writelane_b32 v0, s30, 0 -; GFX942-NEXT: v_writelane_b32 v0, s31, 1 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[16:31] +; GFX942-NEXT: ; def s[0:15] ; GFX942-NEXT: ;;#ASMEND +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_writelane_b32 v0, s31, 1 ; GFX942-NEXT: ;;#ASMSTART -; GFX942-NEXT: ; def s[0:15] +; GFX942-NEXT: ; def s[16:31] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: s_mov_b32 s28, s14 ; GFX942-NEXT: s_mov_b32 s29, s15 ; GFX942-NEXT: s_mov_b64 s[8:9], s[28:29] ; GFX942-NEXT: s_mov_b64 s[10:11], s[30:31] +; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: ;;#ASMSTART ; GFX942-NEXT: ; use s[8:11] ; GFX942-NEXT: ;;#ASMEND ; GFX942-NEXT: v_readlane_b32 s31, v0, 1 -; GFX942-NEXT: v_readlane_b32 s30, v0, 0 ; GFX942-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX942-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload ; GFX942-NEXT: s_mov_b64 exec, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-vgpr-lanes-usage.mir b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-vgpr-lanes-usage.mir index 1ffef8e60d90d..ea67593d72761 100644 --- a/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-vgpr-lanes-usage.mir +++ b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-vgpr-lanes-usage.mir @@ -24,10 +24,10 @@ machineFunctionInfo: body: | bb.0: ; SGPR_SPILLED-LABEL: name: stack-slot-share-equal-sized-spills - ; SGPR_SPILLED: liveins: $sgpr30, $sgpr31, $vgpr62 + ; SGPR_SPILLED: liveins: $vgpr62, $sgpr30_sgpr31 ; SGPR_SPILLED-NEXT: {{ $}} - ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr62 - ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62 + ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR $sgpr30, 0, $vgpr62, implicit-def $sgpr30_sgpr31, implicit $sgpr30_sgpr31 + ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62, implicit killed $sgpr30_sgpr31 ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, [[DEF]] ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr0, 1, [[DEF]], implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1 @@ -89,10 +89,10 @@ machineFunctionInfo: body: | bb.0: ; SGPR_SPILLED-LABEL: name: stack-slot-share-unequal-sized-spills-with-large-spill-first - ; SGPR_SPILLED: liveins: $sgpr30, $sgpr31, $vgpr62 + ; SGPR_SPILLED: liveins: $vgpr62, $sgpr30_sgpr31 ; SGPR_SPILLED-NEXT: {{ $}} - ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr62 - ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62 + ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR $sgpr30, 0, $vgpr62, implicit-def $sgpr30_sgpr31, implicit $sgpr30_sgpr31 + ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62, implicit killed $sgpr30_sgpr31 ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, [[DEF]] ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr2, 1, [[DEF]], implicit-def $sgpr2_sgpr3, implicit $sgpr2_sgpr3 @@ -152,10 +152,10 @@ machineFunctionInfo: body: | bb.0: ; SGPR_SPILLED-LABEL: name: stack-slot-share-unequal-sized-spills-with-small-spill-first - ; SGPR_SPILLED: liveins: $sgpr30, $sgpr31, $vgpr62 + ; SGPR_SPILLED: liveins: $vgpr62, $sgpr30_sgpr31 ; SGPR_SPILLED-NEXT: {{ $}} - ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr62 - ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62 + ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR $sgpr30, 0, $vgpr62, implicit-def $sgpr30_sgpr31, implicit $sgpr30_sgpr31 + ; SGPR_SPILLED-NEXT: $vgpr62 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr62, implicit killed $sgpr30_sgpr31 ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr32, 0, [[DEF]] ; SGPR_SPILLED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr0, 1, [[DEF]] diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll index 00214ef36e1f0..d4b53e9d338f8 100644 --- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll +++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll @@ -242,8 +242,8 @@ define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32_byval_parent(i32 %a, pt ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s4, v40, 2 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -425,8 +425,8 @@ define fastcc i32 @no_sibling_call_callee_more_stack_space(i32 %a, i32 %b) #1 { ; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s4, v40, 2 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -469,11 +469,11 @@ define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i3 ; GCN-NEXT: v_mov_b32_e32 v1, v40 ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: v_readlane_b32 s30, v42, 0 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, sibling_call_i32_fastcc_i32_i32@rel32@hi+12 ; GCN-NEXT: v_readlane_b32 s31, v42, 1 -; GCN-NEXT: v_readlane_b32 s30, v42, 0 ; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s6, v42, 2 ; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 @@ -603,23 +603,23 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr ; FIJI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; FIJI-NEXT: s_mov_b64 exec, s[18:19] ; FIJI-NEXT: v_writelane_b32 v40, s16, 18 -; FIJI-NEXT: v_writelane_b32 v40, s30, 0 -; FIJI-NEXT: v_writelane_b32 v40, s31, 1 -; FIJI-NEXT: v_writelane_b32 v40, s34, 2 -; FIJI-NEXT: v_writelane_b32 v40, s35, 3 -; FIJI-NEXT: v_writelane_b32 v40, s36, 4 -; FIJI-NEXT: v_writelane_b32 v40, s37, 5 -; FIJI-NEXT: v_writelane_b32 v40, s38, 6 -; FIJI-NEXT: v_writelane_b32 v40, s39, 7 -; FIJI-NEXT: v_writelane_b32 v40, s48, 8 -; FIJI-NEXT: v_writelane_b32 v40, s49, 9 -; FIJI-NEXT: v_writelane_b32 v40, s50, 10 -; FIJI-NEXT: v_writelane_b32 v40, s51, 11 -; FIJI-NEXT: v_writelane_b32 v40, s52, 12 -; FIJI-NEXT: v_writelane_b32 v40, s53, 13 -; FIJI-NEXT: v_writelane_b32 v40, s54, 14 -; FIJI-NEXT: v_writelane_b32 v40, s55, 15 -; FIJI-NEXT: v_writelane_b32 v40, s64, 16 +; FIJI-NEXT: v_writelane_b32 v40, s34, 0 +; FIJI-NEXT: v_writelane_b32 v40, s35, 1 +; FIJI-NEXT: v_writelane_b32 v40, s36, 2 +; FIJI-NEXT: v_writelane_b32 v40, s37, 3 +; FIJI-NEXT: v_writelane_b32 v40, s38, 4 +; FIJI-NEXT: v_writelane_b32 v40, s39, 5 +; FIJI-NEXT: v_writelane_b32 v40, s48, 6 +; FIJI-NEXT: v_writelane_b32 v40, s49, 7 +; FIJI-NEXT: v_writelane_b32 v40, s50, 8 +; FIJI-NEXT: v_writelane_b32 v40, s51, 9 +; FIJI-NEXT: v_writelane_b32 v40, s52, 10 +; FIJI-NEXT: v_writelane_b32 v40, s53, 11 +; FIJI-NEXT: v_writelane_b32 v40, s54, 12 +; FIJI-NEXT: v_writelane_b32 v40, s55, 13 +; FIJI-NEXT: v_writelane_b32 v40, s64, 14 +; FIJI-NEXT: v_writelane_b32 v40, s65, 15 +; FIJI-NEXT: v_writelane_b32 v40, s30, 16 ; FIJI-NEXT: s_mov_b32 s50, s15 ; FIJI-NEXT: s_mov_b32 s51, s14 ; FIJI-NEXT: s_mov_b32 s52, s13 @@ -631,7 +631,7 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr ; FIJI-NEXT: v_add_u32_e32 v3, vcc, v3, v4 ; FIJI-NEXT: s_mov_b64 s[54:55], exec ; FIJI-NEXT: s_addk_i32 s32, 0x400 -; FIJI-NEXT: v_writelane_b32 v40, s65, 17 +; FIJI-NEXT: v_writelane_b32 v40, s31, 17 ; FIJI-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; FIJI-NEXT: v_readfirstlane_b32 s16, v0 ; FIJI-NEXT: v_readfirstlane_b32 s17, v1 @@ -657,25 +657,25 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr ; FIJI-NEXT: s_cbranch_execnz .LBB18_1 ; FIJI-NEXT: ; %bb.2: ; FIJI-NEXT: s_mov_b64 exec, s[54:55] +; FIJI-NEXT: v_readlane_b32 s30, v40, 16 ; FIJI-NEXT: v_mov_b32_e32 v0, v4 -; FIJI-NEXT: v_readlane_b32 s65, v40, 17 -; FIJI-NEXT: v_readlane_b32 s64, v40, 16 -; FIJI-NEXT: v_readlane_b32 s55, v40, 15 -; FIJI-NEXT: v_readlane_b32 s54, v40, 14 -; FIJI-NEXT: v_readlane_b32 s53, v40, 13 -; FIJI-NEXT: v_readlane_b32 s52, v40, 12 -; FIJI-NEXT: v_readlane_b32 s51, v40, 11 -; FIJI-NEXT: v_readlane_b32 s50, v40, 10 -; FIJI-NEXT: v_readlane_b32 s49, v40, 9 -; FIJI-NEXT: v_readlane_b32 s48, v40, 8 -; FIJI-NEXT: v_readlane_b32 s39, v40, 7 -; FIJI-NEXT: v_readlane_b32 s38, v40, 6 -; FIJI-NEXT: v_readlane_b32 s37, v40, 5 -; FIJI-NEXT: v_readlane_b32 s36, v40, 4 -; FIJI-NEXT: v_readlane_b32 s35, v40, 3 -; FIJI-NEXT: v_readlane_b32 s34, v40, 2 -; FIJI-NEXT: v_readlane_b32 s31, v40, 1 -; FIJI-NEXT: v_readlane_b32 s30, v40, 0 +; FIJI-NEXT: v_readlane_b32 s31, v40, 17 +; FIJI-NEXT: v_readlane_b32 s65, v40, 15 +; FIJI-NEXT: v_readlane_b32 s64, v40, 14 +; FIJI-NEXT: v_readlane_b32 s55, v40, 13 +; FIJI-NEXT: v_readlane_b32 s54, v40, 12 +; FIJI-NEXT: v_readlane_b32 s53, v40, 11 +; FIJI-NEXT: v_readlane_b32 s52, v40, 10 +; FIJI-NEXT: v_readlane_b32 s51, v40, 9 +; FIJI-NEXT: v_readlane_b32 s50, v40, 8 +; FIJI-NEXT: v_readlane_b32 s49, v40, 7 +; FIJI-NEXT: v_readlane_b32 s48, v40, 6 +; FIJI-NEXT: v_readlane_b32 s39, v40, 5 +; FIJI-NEXT: v_readlane_b32 s38, v40, 4 +; FIJI-NEXT: v_readlane_b32 s37, v40, 3 +; FIJI-NEXT: v_readlane_b32 s36, v40, 2 +; FIJI-NEXT: v_readlane_b32 s35, v40, 1 +; FIJI-NEXT: v_readlane_b32 s34, v40, 0 ; FIJI-NEXT: s_mov_b32 s32, s33 ; FIJI-NEXT: v_readlane_b32 s4, v40, 18 ; FIJI-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -694,23 +694,23 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr ; HAWAII-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; HAWAII-NEXT: s_mov_b64 exec, s[18:19] ; HAWAII-NEXT: v_writelane_b32 v40, s16, 18 -; HAWAII-NEXT: v_writelane_b32 v40, s30, 0 -; HAWAII-NEXT: v_writelane_b32 v40, s31, 1 -; HAWAII-NEXT: v_writelane_b32 v40, s34, 2 -; HAWAII-NEXT: v_writelane_b32 v40, s35, 3 -; HAWAII-NEXT: v_writelane_b32 v40, s36, 4 -; HAWAII-NEXT: v_writelane_b32 v40, s37, 5 -; HAWAII-NEXT: v_writelane_b32 v40, s38, 6 -; HAWAII-NEXT: v_writelane_b32 v40, s39, 7 -; HAWAII-NEXT: v_writelane_b32 v40, s48, 8 -; HAWAII-NEXT: v_writelane_b32 v40, s49, 9 -; HAWAII-NEXT: v_writelane_b32 v40, s50, 10 -; HAWAII-NEXT: v_writelane_b32 v40, s51, 11 -; HAWAII-NEXT: v_writelane_b32 v40, s52, 12 -; HAWAII-NEXT: v_writelane_b32 v40, s53, 13 -; HAWAII-NEXT: v_writelane_b32 v40, s54, 14 -; HAWAII-NEXT: v_writelane_b32 v40, s55, 15 -; HAWAII-NEXT: v_writelane_b32 v40, s64, 16 +; HAWAII-NEXT: v_writelane_b32 v40, s34, 0 +; HAWAII-NEXT: v_writelane_b32 v40, s35, 1 +; HAWAII-NEXT: v_writelane_b32 v40, s36, 2 +; HAWAII-NEXT: v_writelane_b32 v40, s37, 3 +; HAWAII-NEXT: v_writelane_b32 v40, s38, 4 +; HAWAII-NEXT: v_writelane_b32 v40, s39, 5 +; HAWAII-NEXT: v_writelane_b32 v40, s48, 6 +; HAWAII-NEXT: v_writelane_b32 v40, s49, 7 +; HAWAII-NEXT: v_writelane_b32 v40, s50, 8 +; HAWAII-NEXT: v_writelane_b32 v40, s51, 9 +; HAWAII-NEXT: v_writelane_b32 v40, s52, 10 +; HAWAII-NEXT: v_writelane_b32 v40, s53, 11 +; HAWAII-NEXT: v_writelane_b32 v40, s54, 12 +; HAWAII-NEXT: v_writelane_b32 v40, s55, 13 +; HAWAII-NEXT: v_writelane_b32 v40, s64, 14 +; HAWAII-NEXT: v_writelane_b32 v40, s65, 15 +; HAWAII-NEXT: v_writelane_b32 v40, s30, 16 ; HAWAII-NEXT: s_mov_b32 s50, s15 ; HAWAII-NEXT: s_mov_b32 s51, s14 ; HAWAII-NEXT: s_mov_b32 s52, s13 @@ -722,7 +722,7 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr ; HAWAII-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; HAWAII-NEXT: s_mov_b64 s[54:55], exec ; HAWAII-NEXT: s_addk_i32 s32, 0x400 -; HAWAII-NEXT: v_writelane_b32 v40, s65, 17 +; HAWAII-NEXT: v_writelane_b32 v40, s31, 17 ; HAWAII-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; HAWAII-NEXT: v_readfirstlane_b32 s16, v0 ; HAWAII-NEXT: v_readfirstlane_b32 s17, v1 @@ -748,25 +748,25 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr ; HAWAII-NEXT: s_cbranch_execnz .LBB18_1 ; HAWAII-NEXT: ; %bb.2: ; HAWAII-NEXT: s_mov_b64 exec, s[54:55] +; HAWAII-NEXT: v_readlane_b32 s30, v40, 16 ; HAWAII-NEXT: v_mov_b32_e32 v0, v4 -; HAWAII-NEXT: v_readlane_b32 s65, v40, 17 -; HAWAII-NEXT: v_readlane_b32 s64, v40, 16 -; HAWAII-NEXT: v_readlane_b32 s55, v40, 15 -; HAWAII-NEXT: v_readlane_b32 s54, v40, 14 -; HAWAII-NEXT: v_readlane_b32 s53, v40, 13 -; HAWAII-NEXT: v_readlane_b32 s52, v40, 12 -; HAWAII-NEXT: v_readlane_b32 s51, v40, 11 -; HAWAII-NEXT: v_readlane_b32 s50, v40, 10 -; HAWAII-NEXT: v_readlane_b32 s49, v40, 9 -; HAWAII-NEXT: v_readlane_b32 s48, v40, 8 -; HAWAII-NEXT: v_readlane_b32 s39, v40, 7 -; HAWAII-NEXT: v_readlane_b32 s38, v40, 6 -; HAWAII-NEXT: v_readlane_b32 s37, v40, 5 -; HAWAII-NEXT: v_readlane_b32 s36, v40, 4 -; HAWAII-NEXT: v_readlane_b32 s35, v40, 3 -; HAWAII-NEXT: v_readlane_b32 s34, v40, 2 -; HAWAII-NEXT: v_readlane_b32 s31, v40, 1 -; HAWAII-NEXT: v_readlane_b32 s30, v40, 0 +; HAWAII-NEXT: v_readlane_b32 s31, v40, 17 +; HAWAII-NEXT: v_readlane_b32 s65, v40, 15 +; HAWAII-NEXT: v_readlane_b32 s64, v40, 14 +; HAWAII-NEXT: v_readlane_b32 s55, v40, 13 +; HAWAII-NEXT: v_readlane_b32 s54, v40, 12 +; HAWAII-NEXT: v_readlane_b32 s53, v40, 11 +; HAWAII-NEXT: v_readlane_b32 s52, v40, 10 +; HAWAII-NEXT: v_readlane_b32 s51, v40, 9 +; HAWAII-NEXT: v_readlane_b32 s50, v40, 8 +; HAWAII-NEXT: v_readlane_b32 s49, v40, 7 +; HAWAII-NEXT: v_readlane_b32 s48, v40, 6 +; HAWAII-NEXT: v_readlane_b32 s39, v40, 5 +; HAWAII-NEXT: v_readlane_b32 s38, v40, 4 +; HAWAII-NEXT: v_readlane_b32 s37, v40, 3 +; HAWAII-NEXT: v_readlane_b32 s36, v40, 2 +; HAWAII-NEXT: v_readlane_b32 s35, v40, 1 +; HAWAII-NEXT: v_readlane_b32 s34, v40, 0 ; HAWAII-NEXT: s_mov_b32 s32, s33 ; HAWAII-NEXT: v_readlane_b32 s4, v40, 18 ; HAWAII-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -785,23 +785,23 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-NEXT: v_writelane_b32 v40, s16, 18 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s35, 3 -; GFX9-NEXT: v_writelane_b32 v40, s36, 4 -; GFX9-NEXT: v_writelane_b32 v40, s37, 5 -; GFX9-NEXT: v_writelane_b32 v40, s38, 6 -; GFX9-NEXT: v_writelane_b32 v40, s39, 7 -; GFX9-NEXT: v_writelane_b32 v40, s48, 8 -; GFX9-NEXT: v_writelane_b32 v40, s49, 9 -; GFX9-NEXT: v_writelane_b32 v40, s50, 10 -; GFX9-NEXT: v_writelane_b32 v40, s51, 11 -; GFX9-NEXT: v_writelane_b32 v40, s52, 12 -; GFX9-NEXT: v_writelane_b32 v40, s53, 13 -; GFX9-NEXT: v_writelane_b32 v40, s54, 14 -; GFX9-NEXT: v_writelane_b32 v40, s55, 15 -; GFX9-NEXT: v_writelane_b32 v40, s64, 16 +; GFX9-NEXT: v_writelane_b32 v40, s34, 0 +; GFX9-NEXT: v_writelane_b32 v40, s35, 1 +; GFX9-NEXT: v_writelane_b32 v40, s36, 2 +; GFX9-NEXT: v_writelane_b32 v40, s37, 3 +; GFX9-NEXT: v_writelane_b32 v40, s38, 4 +; GFX9-NEXT: v_writelane_b32 v40, s39, 5 +; GFX9-NEXT: v_writelane_b32 v40, s48, 6 +; GFX9-NEXT: v_writelane_b32 v40, s49, 7 +; GFX9-NEXT: v_writelane_b32 v40, s50, 8 +; GFX9-NEXT: v_writelane_b32 v40, s51, 9 +; GFX9-NEXT: v_writelane_b32 v40, s52, 10 +; GFX9-NEXT: v_writelane_b32 v40, s53, 11 +; GFX9-NEXT: v_writelane_b32 v40, s54, 12 +; GFX9-NEXT: v_writelane_b32 v40, s55, 13 +; GFX9-NEXT: v_writelane_b32 v40, s64, 14 +; GFX9-NEXT: v_writelane_b32 v40, s65, 15 +; GFX9-NEXT: v_writelane_b32 v40, s30, 16 ; GFX9-NEXT: s_mov_b32 s50, s15 ; GFX9-NEXT: s_mov_b32 s51, s14 ; GFX9-NEXT: s_mov_b32 s52, s13 @@ -813,7 +813,7 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr ; GFX9-NEXT: v_add_u32_e32 v3, v3, v4 ; GFX9-NEXT: s_mov_b64 s[54:55], exec ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s65, 17 +; GFX9-NEXT: v_writelane_b32 v40, s31, 17 ; GFX9-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_readfirstlane_b32 s16, v0 ; GFX9-NEXT: v_readfirstlane_b32 s17, v1 @@ -839,25 +839,25 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr ; GFX9-NEXT: s_cbranch_execnz .LBB18_1 ; GFX9-NEXT: ; %bb.2: ; GFX9-NEXT: s_mov_b64 exec, s[54:55] +; GFX9-NEXT: v_readlane_b32 s30, v40, 16 ; GFX9-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-NEXT: v_readlane_b32 s65, v40, 17 -; GFX9-NEXT: v_readlane_b32 s64, v40, 16 -; GFX9-NEXT: v_readlane_b32 s55, v40, 15 -; GFX9-NEXT: v_readlane_b32 s54, v40, 14 -; GFX9-NEXT: v_readlane_b32 s53, v40, 13 -; GFX9-NEXT: v_readlane_b32 s52, v40, 12 -; GFX9-NEXT: v_readlane_b32 s51, v40, 11 -; GFX9-NEXT: v_readlane_b32 s50, v40, 10 -; GFX9-NEXT: v_readlane_b32 s49, v40, 9 -; GFX9-NEXT: v_readlane_b32 s48, v40, 8 -; GFX9-NEXT: v_readlane_b32 s39, v40, 7 -; GFX9-NEXT: v_readlane_b32 s38, v40, 6 -; GFX9-NEXT: v_readlane_b32 s37, v40, 5 -; GFX9-NEXT: v_readlane_b32 s36, v40, 4 -; GFX9-NEXT: v_readlane_b32 s35, v40, 3 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s31, v40, 17 +; GFX9-NEXT: v_readlane_b32 s65, v40, 15 +; GFX9-NEXT: v_readlane_b32 s64, v40, 14 +; GFX9-NEXT: v_readlane_b32 s55, v40, 13 +; GFX9-NEXT: v_readlane_b32 s54, v40, 12 +; GFX9-NEXT: v_readlane_b32 s53, v40, 11 +; GFX9-NEXT: v_readlane_b32 s52, v40, 10 +; GFX9-NEXT: v_readlane_b32 s51, v40, 9 +; GFX9-NEXT: v_readlane_b32 s50, v40, 8 +; GFX9-NEXT: v_readlane_b32 s49, v40, 7 +; GFX9-NEXT: v_readlane_b32 s48, v40, 6 +; GFX9-NEXT: v_readlane_b32 s39, v40, 5 +; GFX9-NEXT: v_readlane_b32 s38, v40, 4 +; GFX9-NEXT: v_readlane_b32 s37, v40, 3 +; GFX9-NEXT: v_readlane_b32 s36, v40, 2 +; GFX9-NEXT: v_readlane_b32 s35, v40, 1 +; GFX9-NEXT: v_readlane_b32 s34, v40, 0 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 18 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll index 52d9a30ffb8ba..1c9ee01807a5f 100644 --- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll @@ -311,8 +311,8 @@ define void @func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 { ; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: s_mov_b32 s32, s34 ; GCN-NEXT: v_readlane_b32 s4, v40, 2 ; GCN-NEXT: v_readlane_b32 s34, v40, 3 diff --git a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll index d2394bab82c77..06f1c08d1f04e 100644 --- a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll +++ b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll @@ -1283,11 +1283,11 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-OPT-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; WAVE32-OPT-NEXT: s_swappc_b64 s[30:31], s[16:17] ; WAVE32-OPT-NEXT: s_mov_b32 s32, s18 +; WAVE32-OPT-NEXT: v_readlane_b32 s30, v32, 0 ; WAVE32-OPT-NEXT: ;;#ASMSTART ; WAVE32-OPT-NEXT: ; use s19 ; WAVE32-OPT-NEXT: ;;#ASMEND ; WAVE32-OPT-NEXT: v_readlane_b32 s31, v32, 1 -; WAVE32-OPT-NEXT: v_readlane_b32 s30, v32, 0 ; WAVE32-OPT-NEXT: s_mov_b32 s32, s33 ; WAVE32-OPT-NEXT: s_xor_saveexec_b32 s4, -1 ; WAVE32-OPT-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload @@ -1318,11 +1318,11 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE64-OPT-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; WAVE64-OPT-NEXT: s_swappc_b64 s[30:31], s[16:17] ; WAVE64-OPT-NEXT: s_mov_b32 s32, s18 +; WAVE64-OPT-NEXT: v_readlane_b32 s30, v32, 0 ; WAVE64-OPT-NEXT: ;;#ASMSTART ; WAVE64-OPT-NEXT: ; use s19 ; WAVE64-OPT-NEXT: ;;#ASMEND ; WAVE64-OPT-NEXT: v_readlane_b32 s31, v32, 1 -; WAVE64-OPT-NEXT: v_readlane_b32 s30, v32, 0 ; WAVE64-OPT-NEXT: s_mov_b32 s32, s33 ; WAVE64-OPT-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; WAVE64-OPT-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload @@ -1431,8 +1431,8 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-O0-NEXT: ; use s5 ; WAVE32-O0-NEXT: ;;#ASMEND ; WAVE32-O0-NEXT: s_mov_b32 s32, s4 -; WAVE32-O0-NEXT: v_readlane_b32 s31, v32, 1 ; WAVE32-O0-NEXT: v_readlane_b32 s30, v32, 0 +; WAVE32-O0-NEXT: v_readlane_b32 s31, v32, 1 ; WAVE32-O0-NEXT: s_mov_b32 s32, s33 ; WAVE32-O0-NEXT: s_xor_saveexec_b32 s4, -1 ; WAVE32-O0-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload @@ -1542,8 +1542,8 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE64-O0-NEXT: ; use s5 ; WAVE64-O0-NEXT: ;;#ASMEND ; WAVE64-O0-NEXT: s_mov_b32 s32, s4 -; WAVE64-O0-NEXT: v_readlane_b32 s31, v32, 1 ; WAVE64-O0-NEXT: v_readlane_b32 s30, v32, 0 +; WAVE64-O0-NEXT: v_readlane_b32 s31, v32, 1 ; WAVE64-O0-NEXT: s_mov_b32 s32, s33 ; WAVE64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; WAVE64-O0-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload @@ -1653,8 +1653,8 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-WWM-PREALLOC-NEXT: ; use s5 ; WAVE32-WWM-PREALLOC-NEXT: ;;#ASMEND ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s32, s4 -; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s31, v33, 1 ; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s30, v33, 0 +; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s31, v33, 1 ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s32, s33 ; WAVE32-WWM-PREALLOC-NEXT: s_xor_saveexec_b32 s4, -1 ; WAVE32-WWM-PREALLOC-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll b/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll index 5c83491e7289e..4232a09d9fc3a 100644 --- a/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll +++ b/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll @@ -194,8 +194,8 @@ define void @outgoing_f16_arg(ptr %ptr) #0 { ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7-NEXT: v_readlane_b32 s31, v40, 1 ; GFX7-NEXT: v_readlane_b32 s30, v40, 0 +; GFX7-NEXT: v_readlane_b32 s31, v40, 1 ; GFX7-NEXT: s_mov_b32 s32, s33 ; GFX7-NEXT: v_readlane_b32 s4, v40, 2 ; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -230,8 +230,8 @@ define void @outgoing_v2f16_arg(ptr %ptr) #0 { ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX7-NEXT: v_readlane_b32 s31, v40, 1 ; GFX7-NEXT: v_readlane_b32 s30, v40, 0 +; GFX7-NEXT: v_readlane_b32 s31, v40, 1 ; GFX7-NEXT: s_mov_b32 s32, s33 ; GFX7-NEXT: v_readlane_b32 s4, v40, 2 ; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -266,8 +266,8 @@ define void @outgoing_f16_return(ptr %ptr) #0 { ; GFX7-NEXT: v_mov_b32_e32 v40, v0 ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_readlane_b32 s31, v42, 1 ; GFX7-NEXT: v_readlane_b32 s30, v42, 0 +; GFX7-NEXT: v_readlane_b32 s31, v42, 1 ; GFX7-NEXT: s_mov_b32 s32, s33 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 @@ -309,8 +309,8 @@ define void @outgoing_v2f16_return(ptr %ptr) #0 { ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_readlane_b32 s31, v42, 1 ; GFX7-NEXT: v_readlane_b32 s30, v42, 0 +; GFX7-NEXT: v_readlane_b32 s31, v42, 1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -381,8 +381,8 @@ define void @outgoing_v4f16_return(ptr %ptr) #0 { ; GFX7-NEXT: flat_store_dword v[40:41], v4 ; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX7-NEXT: v_readlane_b32 s31, v42, 1 ; GFX7-NEXT: v_readlane_b32 s30, v42, 0 +; GFX7-NEXT: v_readlane_b32 s31, v42, 1 ; GFX7-NEXT: s_mov_b32 s32, s33 ; GFX7-NEXT: v_readlane_b32 s4, v42, 2 ; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -468,8 +468,8 @@ define void @outgoing_v8f16_return(ptr %ptr) #0 { ; GFX7-NEXT: flat_store_dword v[40:41], v8 ; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX7-NEXT: v_readlane_b32 s31, v42, 1 ; GFX7-NEXT: v_readlane_b32 s30, v42, 0 +; GFX7-NEXT: v_readlane_b32 s31, v42, 1 ; GFX7-NEXT: s_mov_b32 s32, s33 ; GFX7-NEXT: v_readlane_b32 s4, v42, 2 ; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -518,6 +518,7 @@ define half @call_split_type_used_outside_block_v8f16() #0 { ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_readlane_b32 s30, v40, 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 @@ -527,7 +528,6 @@ define half @call_split_type_used_outside_block_v8f16() #0 { ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_readlane_b32 s31, v40, 1 -; GFX7-NEXT: v_readlane_b32 s30, v40, 0 ; GFX7-NEXT: s_mov_b32 s32, s33 ; GFX7-NEXT: v_readlane_b32 s4, v40, 2 ; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 diff --git a/llvm/test/CodeGen/AMDGPU/swdev504645-global-fold.ll b/llvm/test/CodeGen/AMDGPU/swdev504645-global-fold.ll index 5c6fcd4f977e3..13cde61ff16a0 100644 --- a/llvm/test/CodeGen/AMDGPU/swdev504645-global-fold.ll +++ b/llvm/test/CodeGen/AMDGPU/swdev504645-global-fold.ll @@ -18,11 +18,12 @@ define void @test_load_zext() #0 { ; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; CHECK-NEXT: v_writelane_b32 v40, s30, 0 ; CHECK-NEXT: s_mov_b32 s0, DescriptorBuffer@abs32@lo +; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: v_writelane_b32 v40, s31, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[2:3] -; CHECK-NEXT: v_readlane_b32 s31, v40, 1 ; CHECK-NEXT: v_readlane_b32 s30, v40, 0 +; CHECK-NEXT: v_readlane_b32 s31, v40, 1 ; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: v_readlane_b32 s0, v40, 2 ; CHECK-NEXT: s_or_saveexec_b64 s[2:3], -1 diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll index d76b2e5f2358d..837b66dadbba5 100644 --- a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll +++ b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll @@ -26,8 +26,8 @@ define void @tail_call_i32_inreg_divergent(i32 %vgpr) #0 { ; CHECK-NEXT: s_addc_u32 s17, s17, void_func_i32_inreg@rel32@hi+12 ; CHECK-NEXT: ; illegal copy v0 to s0 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: v_readlane_b32 s31, v40, 1 ; CHECK-NEXT: v_readlane_b32 s30, v40, 0 +; CHECK-NEXT: v_readlane_b32 s31, v40, 1 ; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: v_readlane_b32 s4, v40, 2 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -62,8 +62,8 @@ define void @indirect_tail_call_i32_inreg_divergent(i32 %vgpr) #0 { ; CHECK-NEXT: ; illegal copy v0 to s0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: v_readlane_b32 s31, v40, 1 ; CHECK-NEXT: v_readlane_b32 s30, v40, 0 +; CHECK-NEXT: v_readlane_b32 s31, v40, 1 ; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: v_readlane_b32 s4, v40, 2 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 diff --git a/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll b/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll index e78d62561238b..e5215fe1acdef 100644 --- a/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll +++ b/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll @@ -646,29 +646,30 @@ define i32 @s_in_multiuse_A(i32 inreg %x, i32 inreg %y, i32 inreg %z, i32 inreg ; GCN-NEXT: s_or_saveexec_b32 s16, -1 ; GCN-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b32 exec_lo, s16 -; GCN-NEXT: v_writelane_b32 v40, s2, 4 ; GCN-NEXT: s_add_i32 s32, s32, 16 ; GCN-NEXT: s_getpc_b64 s[16:17] ; GCN-NEXT: s_add_u32 s16, s16, use32@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s17, s17, use32@gotpcrel32@hi+12 -; GCN-NEXT: s_xor_b32 s0, s0, s1 +; GCN-NEXT: v_writelane_b32 v40, s2, 4 ; GCN-NEXT: s_load_b64 s[16:17], s[16:17], 0x0 -; GCN-NEXT: v_writelane_b32 v40, s30, 0 -; GCN-NEXT: v_writelane_b32 v40, s31, 1 -; GCN-NEXT: v_writelane_b32 v40, s34, 2 +; GCN-NEXT: s_xor_b32 s0, s0, s1 +; GCN-NEXT: v_writelane_b32 v40, s34, 0 ; GCN-NEXT: s_mov_b32 s34, s1 -; GCN-NEXT: v_writelane_b32 v40, s35, 3 +; GCN-NEXT: v_writelane_b32 v40, s35, 1 ; GCN-NEXT: s_and_b32 s35, s0, s3 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GCN-NEXT: v_mov_b32_e32 v0, s35 +; GCN-NEXT: v_writelane_b32 v40, s30, 2 +; GCN-NEXT: v_writelane_b32 v40, s31, 3 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: s_xor_b32 s0, s35, s34 -; GCN-NEXT: v_readlane_b32 s35, v40, 3 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_readlane_b32 s30, v40, 2 ; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_readlane_b32 s34, v40, 2 -; GCN-NEXT: v_readlane_b32 s31, v40, 1 -; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: v_readlane_b32 s31, v40, 3 +; GCN-NEXT: v_readlane_b32 s35, v40, 1 +; GCN-NEXT: v_readlane_b32 s34, v40, 0 ; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s0, v40, 4 ; GCN-NEXT: s_or_saveexec_b32 s1, -1 @@ -702,20 +703,21 @@ define i32 @s_in_multiuse_B(i32 inreg %x, i32 inreg %y, i32 inreg %z, i32 inreg ; GCN-NEXT: s_xor_b32 s0, s0, s1 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_writelane_b32 v40, s30, 0 -; GCN-NEXT: v_writelane_b32 v40, s31, 1 -; GCN-NEXT: v_writelane_b32 v40, s34, 2 +; GCN-NEXT: v_writelane_b32 v40, s34, 0 ; GCN-NEXT: s_mov_b32 s34, s1 -; GCN-NEXT: v_writelane_b32 v40, s35, 3 +; GCN-NEXT: v_writelane_b32 v40, s35, 1 ; GCN-NEXT: s_and_b32 s35, s0, s3 +; GCN-NEXT: v_writelane_b32 v40, s30, 2 +; GCN-NEXT: v_writelane_b32 v40, s31, 3 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: s_xor_b32 s0, s35, s34 -; GCN-NEXT: v_readlane_b32 s35, v40, 3 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_readlane_b32 s30, v40, 2 ; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_readlane_b32 s34, v40, 2 -; GCN-NEXT: v_readlane_b32 s31, v40, 1 -; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: v_readlane_b32 s31, v40, 3 +; GCN-NEXT: v_readlane_b32 s35, v40, 1 +; GCN-NEXT: v_readlane_b32 s34, v40, 0 ; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s0, v40, 4 ; GCN-NEXT: s_or_saveexec_b32 s1, -1 diff --git a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll index 25e8581fb6cdd..639dcdcbf1c2a 100644 --- a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll +++ b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll @@ -14,22 +14,22 @@ define hidden void @widget() { ; GCN-NEXT: v_writelane_b32 v41, s16, 16 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_writelane_b32 v41, s30, 0 -; GCN-NEXT: v_writelane_b32 v41, s31, 1 -; GCN-NEXT: v_writelane_b32 v41, s34, 2 -; GCN-NEXT: v_writelane_b32 v41, s35, 3 -; GCN-NEXT: v_writelane_b32 v41, s36, 4 -; GCN-NEXT: v_writelane_b32 v41, s37, 5 -; GCN-NEXT: v_writelane_b32 v41, s38, 6 -; GCN-NEXT: v_writelane_b32 v41, s39, 7 -; GCN-NEXT: v_writelane_b32 v41, s48, 8 -; GCN-NEXT: v_writelane_b32 v41, s49, 9 -; GCN-NEXT: v_writelane_b32 v41, s50, 10 -; GCN-NEXT: v_writelane_b32 v41, s51, 11 -; GCN-NEXT: v_writelane_b32 v41, s52, 12 -; GCN-NEXT: v_writelane_b32 v41, s53, 13 -; GCN-NEXT: v_writelane_b32 v41, s54, 14 -; GCN-NEXT: v_writelane_b32 v41, s55, 15 +; GCN-NEXT: v_writelane_b32 v41, s34, 0 +; GCN-NEXT: v_writelane_b32 v41, s35, 1 +; GCN-NEXT: v_writelane_b32 v41, s36, 2 +; GCN-NEXT: v_writelane_b32 v41, s37, 3 +; GCN-NEXT: v_writelane_b32 v41, s38, 4 +; GCN-NEXT: v_writelane_b32 v41, s39, 5 +; GCN-NEXT: v_writelane_b32 v41, s48, 6 +; GCN-NEXT: v_writelane_b32 v41, s49, 7 +; GCN-NEXT: v_writelane_b32 v41, s50, 8 +; GCN-NEXT: v_writelane_b32 v41, s51, 9 +; GCN-NEXT: v_writelane_b32 v41, s52, 10 +; GCN-NEXT: v_writelane_b32 v41, s53, 11 +; GCN-NEXT: v_writelane_b32 v41, s54, 12 +; GCN-NEXT: v_writelane_b32 v41, s55, 13 +; GCN-NEXT: v_writelane_b32 v41, s30, 14 +; GCN-NEXT: v_writelane_b32 v41, s31, 15 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: flat_load_dword v0, v[0:1] @@ -93,22 +93,22 @@ define hidden void @widget() { ; GCN-NEXT: s_addc_u32 s17, s17, wibble@rel32@hi+12 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: .LBB0_8: ; %UnifiedReturnBlock -; GCN-NEXT: v_readlane_b32 s55, v41, 15 -; GCN-NEXT: v_readlane_b32 s54, v41, 14 -; GCN-NEXT: v_readlane_b32 s53, v41, 13 -; GCN-NEXT: v_readlane_b32 s52, v41, 12 -; GCN-NEXT: v_readlane_b32 s51, v41, 11 -; GCN-NEXT: v_readlane_b32 s50, v41, 10 -; GCN-NEXT: v_readlane_b32 s49, v41, 9 -; GCN-NEXT: v_readlane_b32 s48, v41, 8 -; GCN-NEXT: v_readlane_b32 s39, v41, 7 -; GCN-NEXT: v_readlane_b32 s38, v41, 6 -; GCN-NEXT: v_readlane_b32 s37, v41, 5 -; GCN-NEXT: v_readlane_b32 s36, v41, 4 -; GCN-NEXT: v_readlane_b32 s35, v41, 3 -; GCN-NEXT: v_readlane_b32 s34, v41, 2 -; GCN-NEXT: v_readlane_b32 s31, v41, 1 -; GCN-NEXT: v_readlane_b32 s30, v41, 0 +; GCN-NEXT: v_readlane_b32 s30, v41, 14 +; GCN-NEXT: v_readlane_b32 s31, v41, 15 +; GCN-NEXT: v_readlane_b32 s55, v41, 13 +; GCN-NEXT: v_readlane_b32 s54, v41, 12 +; GCN-NEXT: v_readlane_b32 s53, v41, 11 +; GCN-NEXT: v_readlane_b32 s52, v41, 10 +; GCN-NEXT: v_readlane_b32 s51, v41, 9 +; GCN-NEXT: v_readlane_b32 s50, v41, 8 +; GCN-NEXT: v_readlane_b32 s49, v41, 7 +; GCN-NEXT: v_readlane_b32 s48, v41, 6 +; GCN-NEXT: v_readlane_b32 s39, v41, 5 +; GCN-NEXT: v_readlane_b32 s38, v41, 4 +; GCN-NEXT: v_readlane_b32 s37, v41, 3 +; GCN-NEXT: v_readlane_b32 s36, v41, 2 +; GCN-NEXT: v_readlane_b32 s35, v41, 1 +; GCN-NEXT: v_readlane_b32 s34, v41, 0 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s4, v41, 16 @@ -266,32 +266,32 @@ define hidden void @blam() { ; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_writelane_b32 v45, s30, 0 -; GCN-NEXT: v_writelane_b32 v45, s31, 1 -; GCN-NEXT: v_writelane_b32 v45, s34, 2 -; GCN-NEXT: v_writelane_b32 v45, s35, 3 -; GCN-NEXT: v_writelane_b32 v45, s36, 4 -; GCN-NEXT: v_writelane_b32 v45, s37, 5 -; GCN-NEXT: v_writelane_b32 v45, s38, 6 -; GCN-NEXT: v_writelane_b32 v45, s39, 7 -; GCN-NEXT: v_writelane_b32 v45, s48, 8 -; GCN-NEXT: v_writelane_b32 v45, s49, 9 -; GCN-NEXT: v_writelane_b32 v45, s50, 10 -; GCN-NEXT: v_writelane_b32 v45, s51, 11 -; GCN-NEXT: v_writelane_b32 v45, s52, 12 -; GCN-NEXT: v_writelane_b32 v45, s53, 13 -; GCN-NEXT: v_writelane_b32 v45, s54, 14 -; GCN-NEXT: v_writelane_b32 v45, s55, 15 -; GCN-NEXT: v_writelane_b32 v45, s64, 16 -; GCN-NEXT: v_writelane_b32 v45, s65, 17 -; GCN-NEXT: v_writelane_b32 v45, s66, 18 -; GCN-NEXT: v_writelane_b32 v45, s67, 19 -; GCN-NEXT: v_writelane_b32 v45, s68, 20 -; GCN-NEXT: v_writelane_b32 v45, s69, 21 -; GCN-NEXT: v_writelane_b32 v45, s70, 22 -; GCN-NEXT: v_writelane_b32 v45, s71, 23 -; GCN-NEXT: v_writelane_b32 v45, s80, 24 -; GCN-NEXT: v_writelane_b32 v45, s81, 25 +; GCN-NEXT: v_writelane_b32 v45, s34, 0 +; GCN-NEXT: v_writelane_b32 v45, s35, 1 +; GCN-NEXT: v_writelane_b32 v45, s36, 2 +; GCN-NEXT: v_writelane_b32 v45, s37, 3 +; GCN-NEXT: v_writelane_b32 v45, s38, 4 +; GCN-NEXT: v_writelane_b32 v45, s39, 5 +; GCN-NEXT: v_writelane_b32 v45, s48, 6 +; GCN-NEXT: v_writelane_b32 v45, s49, 7 +; GCN-NEXT: v_writelane_b32 v45, s50, 8 +; GCN-NEXT: v_writelane_b32 v45, s51, 9 +; GCN-NEXT: v_writelane_b32 v45, s52, 10 +; GCN-NEXT: v_writelane_b32 v45, s53, 11 +; GCN-NEXT: v_writelane_b32 v45, s54, 12 +; GCN-NEXT: v_writelane_b32 v45, s55, 13 +; GCN-NEXT: v_writelane_b32 v45, s64, 14 +; GCN-NEXT: v_writelane_b32 v45, s65, 15 +; GCN-NEXT: v_writelane_b32 v45, s66, 16 +; GCN-NEXT: v_writelane_b32 v45, s67, 17 +; GCN-NEXT: v_writelane_b32 v45, s68, 18 +; GCN-NEXT: v_writelane_b32 v45, s69, 19 +; GCN-NEXT: v_writelane_b32 v45, s70, 20 +; GCN-NEXT: v_writelane_b32 v45, s71, 21 +; GCN-NEXT: v_writelane_b32 v45, s80, 22 +; GCN-NEXT: v_writelane_b32 v45, s81, 23 +; GCN-NEXT: v_writelane_b32 v45, s30, 24 +; GCN-NEXT: v_writelane_b32 v45, s31, 25 ; GCN-NEXT: v_mov_b32_e32 v40, v31 ; GCN-NEXT: s_mov_b32 s54, s15 ; GCN-NEXT: s_mov_b32 s55, s14 @@ -427,32 +427,32 @@ define hidden void @blam() { ; GCN-NEXT: s_branch .LBB1_1 ; GCN-NEXT: .LBB1_18: ; %DummyReturnBlock ; GCN-NEXT: s_or_b64 exec, exec, s[66:67] -; GCN-NEXT: v_readlane_b32 s81, v45, 25 -; GCN-NEXT: v_readlane_b32 s80, v45, 24 -; GCN-NEXT: v_readlane_b32 s71, v45, 23 -; GCN-NEXT: v_readlane_b32 s70, v45, 22 -; GCN-NEXT: v_readlane_b32 s69, v45, 21 -; GCN-NEXT: v_readlane_b32 s68, v45, 20 -; GCN-NEXT: v_readlane_b32 s67, v45, 19 -; GCN-NEXT: v_readlane_b32 s66, v45, 18 -; GCN-NEXT: v_readlane_b32 s65, v45, 17 -; GCN-NEXT: v_readlane_b32 s64, v45, 16 -; GCN-NEXT: v_readlane_b32 s55, v45, 15 -; GCN-NEXT: v_readlane_b32 s54, v45, 14 -; GCN-NEXT: v_readlane_b32 s53, v45, 13 -; GCN-NEXT: v_readlane_b32 s52, v45, 12 -; GCN-NEXT: v_readlane_b32 s51, v45, 11 -; GCN-NEXT: v_readlane_b32 s50, v45, 10 -; GCN-NEXT: v_readlane_b32 s49, v45, 9 -; GCN-NEXT: v_readlane_b32 s48, v45, 8 -; GCN-NEXT: v_readlane_b32 s39, v45, 7 -; GCN-NEXT: v_readlane_b32 s38, v45, 6 -; GCN-NEXT: v_readlane_b32 s37, v45, 5 -; GCN-NEXT: v_readlane_b32 s36, v45, 4 -; GCN-NEXT: v_readlane_b32 s35, v45, 3 -; GCN-NEXT: v_readlane_b32 s34, v45, 2 -; GCN-NEXT: v_readlane_b32 s31, v45, 1 -; GCN-NEXT: v_readlane_b32 s30, v45, 0 +; GCN-NEXT: v_readlane_b32 s30, v45, 24 +; GCN-NEXT: v_readlane_b32 s31, v45, 25 +; GCN-NEXT: v_readlane_b32 s81, v45, 23 +; GCN-NEXT: v_readlane_b32 s80, v45, 22 +; GCN-NEXT: v_readlane_b32 s71, v45, 21 +; GCN-NEXT: v_readlane_b32 s70, v45, 20 +; GCN-NEXT: v_readlane_b32 s69, v45, 19 +; GCN-NEXT: v_readlane_b32 s68, v45, 18 +; GCN-NEXT: v_readlane_b32 s67, v45, 17 +; GCN-NEXT: v_readlane_b32 s66, v45, 16 +; GCN-NEXT: v_readlane_b32 s65, v45, 15 +; GCN-NEXT: v_readlane_b32 s64, v45, 14 +; GCN-NEXT: v_readlane_b32 s55, v45, 13 +; GCN-NEXT: v_readlane_b32 s54, v45, 12 +; GCN-NEXT: v_readlane_b32 s53, v45, 11 +; GCN-NEXT: v_readlane_b32 s52, v45, 10 +; GCN-NEXT: v_readlane_b32 s51, v45, 9 +; GCN-NEXT: v_readlane_b32 s50, v45, 8 +; GCN-NEXT: v_readlane_b32 s49, v45, 7 +; GCN-NEXT: v_readlane_b32 s48, v45, 6 +; GCN-NEXT: v_readlane_b32 s39, v45, 5 +; GCN-NEXT: v_readlane_b32 s38, v45, 4 +; GCN-NEXT: v_readlane_b32 s37, v45, 3 +; GCN-NEXT: v_readlane_b32 s36, v45, 2 +; GCN-NEXT: v_readlane_b32 s35, v45, 1 +; GCN-NEXT: v_readlane_b32 s34, v45, 0 ; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll index 9ce8859c410fc..c902ac6173935 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll @@ -52,8 +52,8 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v44, 1 ; GFX9-NEXT: v_readlane_b32 s30, v44, 0 +; GFX9-NEXT: v_readlane_b32 s31, v44, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v44, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -109,8 +109,8 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 -; GFX10-NEXT: v_readlane_b32 s31, v44, 1 ; GFX10-NEXT: v_readlane_b32 s30, v44, 0 +; GFX10-NEXT: v_readlane_b32 s31, v44, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s4, v44, 2 ; GFX10-NEXT: s_or_saveexec_b32 s5, -1 @@ -163,8 +163,8 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:4 ; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:8 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:12 -; GFX11-NEXT: v_readlane_b32 s31, v44, 1 ; GFX11-NEXT: v_readlane_b32 s30, v44, 0 +; GFX11-NEXT: v_readlane_b32 s31, v44, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v44, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 @@ -236,8 +236,8 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v45, 1 ; GFX9-NEXT: v_readlane_b32 s30, v45, 0 +; GFX9-NEXT: v_readlane_b32 s31, v45, 1 ; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v45, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 @@ -286,8 +286,8 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 -; GFX10-NEXT: v_readlane_b32 s31, v45, 1 ; GFX10-NEXT: v_readlane_b32 s30, v45, 0 +; GFX10-NEXT: v_readlane_b32 s31, v45, 1 ; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s4, v45, 2 ; GFX10-NEXT: s_or_saveexec_b32 s5, -1 @@ -335,8 +335,8 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:12 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:16 -; GFX11-NEXT: v_readlane_b32 s31, v45, 1 ; GFX11-NEXT: v_readlane_b32 s30, v45, 0 +; GFX11-NEXT: v_readlane_b32 s31, v45, 1 ; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v45, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index 1805add14e47a..a76f4495f85d5 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -3086,8 +3086,8 @@ define void @callee_no_stack_with_call() #1 { ; GFX1032-NEXT: v_writelane_b32 v40, s31, 1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1032-NEXT: v_readlane_b32 s31, v40, 1 ; GFX1032-NEXT: v_readlane_b32 s30, v40, 0 +; GFX1032-NEXT: v_readlane_b32 s31, v40, 1 ; GFX1032-NEXT: s_mov_b32 s32, s33 ; GFX1032-NEXT: v_readlane_b32 s4, v40, 2 ; GFX1032-NEXT: s_or_saveexec_b32 s5, -1 @@ -3117,8 +3117,8 @@ define void @callee_no_stack_with_call() #1 { ; GFX1064-NEXT: v_writelane_b32 v40, s31, 1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: v_readlane_b32 s31, v40, 1 ; GFX1064-NEXT: v_readlane_b32 s30, v40, 0 +; GFX1064-NEXT: v_readlane_b32 s31, v40, 1 ; GFX1064-NEXT: s_mov_b32 s32, s33 ; GFX1064-NEXT: v_readlane_b32 s4, v40, 2 ; GFX1064-NEXT: s_or_saveexec_b64 s[6:7], -1 diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll index 50195bc5bf414..087eecbfda19e 100644 --- a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll +++ b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll @@ -1593,8 +1593,8 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; DAGISEL-NEXT: s_wait_alu 0xfffe ; DAGISEL-NEXT: s_swappc_b64 s[30:31], s[0:1] ; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; DAGISEL-NEXT: v_readlane_b32 s31, v40, 2 ; DAGISEL-NEXT: v_readlane_b32 s30, v40, 1 +; DAGISEL-NEXT: v_readlane_b32 s31, v40, 2 ; DAGISEL-NEXT: v_readlane_b32 s4, v40, 0 ; DAGISEL-NEXT: v_readlane_b32 s0, v40, 3 ; DAGISEL-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload @@ -1929,8 +1929,8 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GISEL-NEXT: s_wait_alu 0xfffe ; GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GISEL-NEXT: v_readlane_b32 s31, v40, 2 ; GISEL-NEXT: v_readlane_b32 s30, v40, 1 +; GISEL-NEXT: v_readlane_b32 s31, v40, 2 ; GISEL-NEXT: v_readlane_b32 s4, v40, 0 ; GISEL-NEXT: v_readlane_b32 s0, v40, 3 ; GISEL-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload @@ -2266,8 +2266,8 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; DAGISEL64-NEXT: s_wait_alu 0xfffe ; DAGISEL64-NEXT: s_swappc_b64 s[30:31], s[0:1] ; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1) -; DAGISEL64-NEXT: v_readlane_b32 s31, v40, 3 ; DAGISEL64-NEXT: v_readlane_b32 s30, v40, 2 +; DAGISEL64-NEXT: v_readlane_b32 s31, v40, 3 ; DAGISEL64-NEXT: v_readlane_b32 s5, v40, 1 ; DAGISEL64-NEXT: v_readlane_b32 s4, v40, 0 ; DAGISEL64-NEXT: v_readlane_b32 s0, v40, 4 @@ -2604,8 +2604,8 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GISEL64-NEXT: s_wait_alu 0xfffe ; GISEL64-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GISEL64-NEXT: v_readlane_b32 s31, v40, 3 ; GISEL64-NEXT: v_readlane_b32 s30, v40, 2 +; GISEL64-NEXT: v_readlane_b32 s31, v40, 3 ; GISEL64-NEXT: v_readlane_b32 s5, v40, 1 ; GISEL64-NEXT: v_readlane_b32 s4, v40, 0 ; GISEL64-NEXT: v_readlane_b32 s0, v40, 4 @@ -3719,8 +3719,8 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ; GFX1250-DAGISEL-NEXT: v_writelane_b32 v40, s31, 2 ; GFX1250-DAGISEL-NEXT: s_swap_pc_i64 s[30:31], s[0:1] ; GFX1250-DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-DAGISEL-NEXT: v_readlane_b32 s31, v40, 2 ; GFX1250-DAGISEL-NEXT: v_readlane_b32 s30, v40, 1 +; GFX1250-DAGISEL-NEXT: v_readlane_b32 s31, v40, 2 ; GFX1250-DAGISEL-NEXT: v_readlane_b32 s4, v40, 0 ; GFX1250-DAGISEL-NEXT: v_readlane_b32 s0, v40, 3 ; GFX1250-DAGISEL-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload @@ -8049,8 +8049,8 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; DAGISEL-NEXT: s_wait_alu 0xfffe ; DAGISEL-NEXT: s_swappc_b64 s[30:31], s[0:1] ; DAGISEL-NEXT: flat_store_b32 v[40:41], v0 -; DAGISEL-NEXT: v_readlane_b32 s31, v42, 2 ; DAGISEL-NEXT: v_readlane_b32 s30, v42, 1 +; DAGISEL-NEXT: v_readlane_b32 s31, v42, 2 ; DAGISEL-NEXT: v_readlane_b32 s4, v42, 0 ; DAGISEL-NEXT: v_readlane_b32 s0, v42, 3 ; DAGISEL-NEXT: s_clause 0x2 ; 12-byte Folded Reload @@ -8390,8 +8390,8 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GISEL-NEXT: s_wait_alu 0xfffe ; GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GISEL-NEXT: flat_store_b32 v[40:41], v0 -; GISEL-NEXT: v_readlane_b32 s31, v42, 2 ; GISEL-NEXT: v_readlane_b32 s30, v42, 1 +; GISEL-NEXT: v_readlane_b32 s31, v42, 2 ; GISEL-NEXT: v_readlane_b32 s4, v42, 0 ; GISEL-NEXT: v_readlane_b32 s0, v42, 3 ; GISEL-NEXT: s_clause 0x2 ; 12-byte Folded Reload @@ -8733,8 +8733,8 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; DAGISEL64-NEXT: s_wait_alu 0xfffe ; DAGISEL64-NEXT: s_swappc_b64 s[30:31], s[0:1] ; DAGISEL64-NEXT: flat_store_b32 v[40:41], v0 -; DAGISEL64-NEXT: v_readlane_b32 s31, v42, 3 ; DAGISEL64-NEXT: v_readlane_b32 s30, v42, 2 +; DAGISEL64-NEXT: v_readlane_b32 s31, v42, 3 ; DAGISEL64-NEXT: v_readlane_b32 s5, v42, 1 ; DAGISEL64-NEXT: v_readlane_b32 s4, v42, 0 ; DAGISEL64-NEXT: v_readlane_b32 s0, v42, 4 @@ -9077,8 +9077,8 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GISEL64-NEXT: s_wait_alu 0xfffe ; GISEL64-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GISEL64-NEXT: flat_store_b32 v[40:41], v0 -; GISEL64-NEXT: v_readlane_b32 s31, v42, 3 ; GISEL64-NEXT: v_readlane_b32 s30, v42, 2 +; GISEL64-NEXT: v_readlane_b32 s31, v42, 3 ; GISEL64-NEXT: v_readlane_b32 s5, v42, 1 ; GISEL64-NEXT: v_readlane_b32 s4, v42, 0 ; GISEL64-NEXT: v_readlane_b32 s0, v42, 4 @@ -10198,8 +10198,8 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float> ; GFX1250-DAGISEL-NEXT: v_writelane_b32 v42, s31, 2 ; GFX1250-DAGISEL-NEXT: s_swap_pc_i64 s[30:31], s[0:1] ; GFX1250-DAGISEL-NEXT: flat_store_b32 v[40:41], v0 -; GFX1250-DAGISEL-NEXT: v_readlane_b32 s31, v42, 2 ; GFX1250-DAGISEL-NEXT: v_readlane_b32 s30, v42, 1 +; GFX1250-DAGISEL-NEXT: v_readlane_b32 s31, v42, 2 ; GFX1250-DAGISEL-NEXT: v_readlane_b32 s4, v42, 0 ; GFX1250-DAGISEL-NEXT: v_readlane_b32 s0, v42, 3 ; GFX1250-DAGISEL-NEXT: s_clause 0x2 ; 12-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll index 06c451869e841..3fe54cd045c0f 100644 --- a/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll @@ -41,12 +41,12 @@ define void @vector_reg_liverange_split() #0 { ; GFX90A-NEXT: s_or_saveexec_b64 s[28:29], -1 ; GFX90A-NEXT: v_accvgpr_read_b32 v39, a32 ; GFX90A-NEXT: s_mov_b64 exec, s[28:29] +; GFX90A-NEXT: v_readlane_b32 s30, v40, 0 ; GFX90A-NEXT: v_readlane_b32 s20, v39, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use s20 ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_readlane_b32 s31, v40, 1 -; GFX90A-NEXT: v_readlane_b32 s30, v40, 0 ; GFX90A-NEXT: s_mov_b32 s32, s33 ; GFX90A-NEXT: v_readlane_b32 s4, v40, 4 ; GFX90A-NEXT: v_readlane_b32 s28, v40, 2 diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll index 9e9fe1809c780..b5c0023bd5c2a 100644 --- a/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll @@ -49,10 +49,10 @@ define void @test() #0 { ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s4, v39, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_readlane_b32 s30, v40, 0 ; GCN-NEXT: global_store_dword v[0:1], v0, off ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s31, v40, 1 -; GCN-NEXT: v_readlane_b32 s30, v40, 0 ; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s4, v40, 4 ; GCN-NEXT: v_readlane_b32 s28, v40, 2 @@ -111,8 +111,8 @@ define void @test() #0 { ; GCN-O0-NEXT: v_mov_b32_e32 v2, s4 ; GCN-O0-NEXT: global_store_dword v[0:1], v2, off ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-O0-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-O0-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-O0-NEXT: s_mov_b32 s32, s33 ; GCN-O0-NEXT: v_readlane_b32 s4, v40, 4 ; GCN-O0-NEXT: v_readlane_b32 s28, v40, 2 diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll index e9a0671ead4e0..fe641367944be 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -387,8 +387,8 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg ; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[36:39], s34 offset:4 -; GFX9-O0-NEXT: v_readlane_b32 s31, v3, 1 ; GFX9-O0-NEXT: v_readlane_b32 s30, v3, 0 +; GFX9-O0-NEXT: v_readlane_b32 s31, v3, 1 ; GFX9-O0-NEXT: s_mov_b32 s32, s33 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload @@ -424,9 +424,9 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg ; GFX9-O3-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-O3-NEXT: v_readlane_b32 s30, v3, 0 ; GFX9-O3-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 ; GFX9-O3-NEXT: v_readlane_b32 s31, v3, 1 -; GFX9-O3-NEXT: v_readlane_b32 s30, v3, 0 ; GFX9-O3-NEXT: s_mov_b32 s32, s33 ; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload @@ -622,8 +622,8 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 ; GFX9-O0-NEXT: s_mov_b32 s34, 0 ; GFX9-O0-NEXT: buffer_store_dwordx2 v[0:1], off, s[36:39], s34 offset:4 -; GFX9-O0-NEXT: v_readlane_b32 s31, v10, 1 ; GFX9-O0-NEXT: v_readlane_b32 s30, v10, 0 +; GFX9-O0-NEXT: v_readlane_b32 s31, v10, 1 ; GFX9-O0-NEXT: s_mov_b32 s32, s33 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload @@ -683,9 +683,9 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-O3-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-O3-NEXT: v_readlane_b32 s30, v8, 0 ; GFX9-O3-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 offset:4 ; GFX9-O3-NEXT: v_readlane_b32 s31, v8, 1 -; GFX9-O3-NEXT: v_readlane_b32 s30, v8, 0 ; GFX9-O3-NEXT: s_mov_b32 s32, s33 ; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload