diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 7cb7f47ddb220..630fdc8e8891c 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2094,11 +2094,11 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { break; case AMDGPU::SI_SPILL_S32_TO_VGPR: - MI.setDesc(get(AMDGPU::V_WRITELANE_B32)); + mutateAndCleanupImplicit(MI, get(AMDGPU::V_WRITELANE_B32)); break; case AMDGPU::SI_RESTORE_S32_FROM_VGPR: - MI.setDesc(get(AMDGPU::V_READLANE_B32)); + mutateAndCleanupImplicit(MI, get(AMDGPU::V_READLANE_B32)); break; case AMDGPU::AV_MOV_B32_IMM_PSEUDO: { Register Dst = MI.getOperand(0).getReg(); diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll index 8879ef5c8265d..d965a3dbcc8a4 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll @@ -8181,8 +8181,8 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s18 ; SI-NEXT: v_readlane_b32 s18, v23, 0 ; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: v_readlane_b32 s19, v23, 1 ; SI-NEXT: s_lshl_b32 s18, s18, 8 +; SI-NEXT: v_readlane_b32 s19, v23, 1 ; SI-NEXT: s_or_b32 s16, s16, s18 ; SI-NEXT: v_readlane_b32 s18, v23, 2 ; SI-NEXT: v_readlane_b32 s19, v23, 3 @@ -8215,8 +8215,8 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s16 ; SI-NEXT: v_readlane_b32 s16, v23, 6 ; SI-NEXT: s_and_b32 s14, s14, 0xff -; SI-NEXT: v_readlane_b32 s17, v23, 7 ; SI-NEXT: s_lshl_b32 s16, s16, 8 +; SI-NEXT: v_readlane_b32 s17, v23, 7 ; SI-NEXT: s_or_b32 s14, s14, s16 ; SI-NEXT: v_readlane_b32 s16, v23, 8 ; SI-NEXT: v_readlane_b32 s17, v23, 9 @@ -8249,8 +8249,8 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s14 ; SI-NEXT: v_readlane_b32 s14, v23, 12 ; SI-NEXT: s_and_b32 s12, s12, 0xff -; SI-NEXT: v_readlane_b32 s15, v23, 13 ; SI-NEXT: s_lshl_b32 s14, s14, 8 +; SI-NEXT: v_readlane_b32 s15, v23, 13 ; SI-NEXT: s_or_b32 s12, s12, s14 ; SI-NEXT: v_readlane_b32 s14, v23, 14 ; SI-NEXT: v_readlane_b32 s15, v23, 15 @@ -8283,8 +8283,8 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s12 ; SI-NEXT: v_readlane_b32 s12, v23, 18 ; SI-NEXT: s_and_b32 s10, s10, 0xff -; SI-NEXT: v_readlane_b32 s13, v23, 19 ; SI-NEXT: s_lshl_b32 s12, s12, 8 +; SI-NEXT: v_readlane_b32 s13, v23, 19 ; SI-NEXT: s_or_b32 s10, s10, s12 ; SI-NEXT: v_readlane_b32 s12, v23, 20 ; SI-NEXT: v_readlane_b32 s13, v23, 21 @@ -8317,8 +8317,8 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: v_readlane_b32 s10, v23, 24 ; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: v_readlane_b32 s11, v23, 25 ; SI-NEXT: s_lshl_b32 s10, s10, 8 +; SI-NEXT: v_readlane_b32 s11, v23, 25 ; SI-NEXT: s_or_b32 s8, s8, s10 ; SI-NEXT: v_readlane_b32 s10, v23, 26 ; SI-NEXT: v_readlane_b32 s11, v23, 27 @@ -8350,8 +8350,8 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_readlane_b32 s8, v23, 30 ; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: v_readlane_b32 s9, v23, 31 ; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: v_readlane_b32 s9, v23, 31 ; SI-NEXT: s_or_b32 s6, s6, s8 ; SI-NEXT: v_readlane_b32 s8, v23, 32 ; SI-NEXT: v_readlane_b32 s9, v23, 33 @@ -8384,8 +8384,8 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_readlane_b32 s6, v23, 36 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: v_readlane_b32 s7, v23, 37 ; SI-NEXT: s_lshl_b32 s6, s6, 8 +; SI-NEXT: v_readlane_b32 s7, v23, 37 ; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: v_readlane_b32 s6, v23, 38 ; SI-NEXT: v_readlane_b32 s7, v23, 39 @@ -8468,148 +8468,149 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB13_4: -; SI-NEXT: ; implicit-def: $sgpr51 ; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v23, s50, 0 -; SI-NEXT: v_writelane_b32 v23, s51, 1 -; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr51 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s51, 1 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s50, 2 -; SI-NEXT: v_writelane_b32 v23, s51, 3 ; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: v_writelane_b32 v23, s50, 2 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s51, 3 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s50, 4 -; SI-NEXT: v_writelane_b32 v23, s51, 5 ; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: v_writelane_b32 v23, s50, 4 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s51, 5 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s50, 6 -; SI-NEXT: v_writelane_b32 v23, s51, 7 ; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: v_writelane_b32 v23, s50, 6 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s51, 7 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s50, 8 -; SI-NEXT: v_writelane_b32 v23, s51, 9 ; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: v_writelane_b32 v23, s50, 8 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s51, 9 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s50, 10 -; SI-NEXT: v_writelane_b32 v23, s51, 11 ; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: v_writelane_b32 v23, s50, 10 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s51, 11 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s50, 12 -; SI-NEXT: v_writelane_b32 v23, s51, 13 ; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: v_writelane_b32 v23, s50, 12 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s51, 13 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s50, 14 -; SI-NEXT: v_writelane_b32 v23, s51, 15 ; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: v_writelane_b32 v23, s50, 14 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s51, 15 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s50, 16 -; SI-NEXT: v_writelane_b32 v23, s51, 17 ; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: v_writelane_b32 v23, s50, 16 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s51, 17 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s50, 18 -; SI-NEXT: v_writelane_b32 v23, s51, 19 ; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: v_writelane_b32 v23, s50, 18 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s51, 19 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s50, 20 -; SI-NEXT: v_writelane_b32 v23, s51, 21 ; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: v_writelane_b32 v23, s50, 20 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s51, 21 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s50, 22 -; SI-NEXT: v_writelane_b32 v23, s51, 23 ; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: v_writelane_b32 v23, s50, 22 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s51, 23 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s50, 24 -; SI-NEXT: v_writelane_b32 v23, s51, 25 ; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: v_writelane_b32 v23, s50, 24 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s51, 25 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s50, 26 -; SI-NEXT: v_writelane_b32 v23, s51, 27 ; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: v_writelane_b32 v23, s50, 26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s51, 27 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s50, 28 -; SI-NEXT: v_writelane_b32 v23, s51, 29 ; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: v_writelane_b32 v23, s50, 28 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s51, 29 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s50, 30 -; SI-NEXT: v_writelane_b32 v23, s51, 31 ; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: v_writelane_b32 v23, s50, 30 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s51, 31 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s50, 32 -; SI-NEXT: v_writelane_b32 v23, s51, 33 ; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: v_writelane_b32 v23, s50, 32 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s51, 33 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s50, 34 -; SI-NEXT: v_writelane_b32 v23, s51, 35 ; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: v_writelane_b32 v23, s50, 34 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s51, 35 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s50, 36 -; SI-NEXT: v_writelane_b32 v23, s51, 37 ; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: v_writelane_b32 v23, s50, 36 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s51, 37 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr50 ; SI-NEXT: v_writelane_b32 v23, s50, 38 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: v_writelane_b32 v23, s51, 39 -; SI-NEXT: ; implicit-def: $sgpr50 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: v_writelane_b32 v23, s50, 40 ; SI-NEXT: ; implicit-def: $sgpr49 ; SI-NEXT: ; implicit-def: $sgpr55 ; SI-NEXT: ; implicit-def: $sgpr54 @@ -8634,7 +8635,6 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr58 ; SI-NEXT: ; implicit-def: $sgpr28 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s50, 40 ; SI-NEXT: ; implicit-def: $sgpr98 ; SI-NEXT: ; implicit-def: $sgpr96 ; SI-NEXT: ; implicit-def: $sgpr86 @@ -10597,10 +10597,9 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: .LBB13_4: ; GFX9-NEXT: ; implicit-def: $sgpr27 ; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr83 -; GFX9-NEXT: ; implicit-def: $sgpr82 ; GFX9-NEXT: ; implicit-def: $sgpr27 ; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr82 ; GFX9-NEXT: v_writelane_b32 v22, s82, 0 ; GFX9-NEXT: ; implicit-def: $sgpr27 ; GFX9-NEXT: ; kill: killed $sgpr27 @@ -10633,6 +10632,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: ; implicit-def: $sgpr85 ; GFX9-NEXT: ; implicit-def: $sgpr84 ; GFX9-NEXT: ; implicit-def: $sgpr81 +; GFX9-NEXT: ; implicit-def: $sgpr83 ; GFX9-NEXT: ; implicit-def: $sgpr36 ; GFX9-NEXT: ; implicit-def: $sgpr34 ; GFX9-NEXT: ; implicit-def: $sgpr30 @@ -10985,12 +10985,10 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: s_lshr_b64 s[62:63], s[20:21], 24 ; GFX11-NEXT: s_branch .LBB13_3 ; GFX11-NEXT: .LBB13_2: -; GFX11-NEXT: ; implicit-def: $vcc_hi ; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: s_mov_b32 s101, -1 ; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 0 +; GFX11-NEXT: ; implicit-def: $vcc_hi +; GFX11-NEXT: ; implicit-def: $vcc_lo ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 @@ -11001,7 +10999,6 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 1 -; GFX11-NEXT: ; implicit-def: $vcc_lo ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 @@ -11013,6 +11010,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 2 +; GFX11-NEXT: ; implicit-def: $vcc_lo ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 @@ -11024,7 +11022,6 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 3 -; GFX11-NEXT: ; implicit-def: $vcc_lo ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 @@ -11036,6 +11033,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 4 +; GFX11-NEXT: ; implicit-def: $vcc_lo ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 @@ -11044,10 +11042,12 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: s_mov_b32 s101, -1 +; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 5 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 5 -; GFX11-NEXT: ; implicit-def: $vcc_lo ; GFX11-NEXT: ; implicit-def: $sgpr45 ; GFX11-NEXT: ; implicit-def: $sgpr44 ; GFX11-NEXT: ; implicit-def: $sgpr30 @@ -11111,17 +11111,17 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: ; implicit-def: $sgpr88 ; GFX11-NEXT: ; implicit-def: $sgpr76 ; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 6 -; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 7 ; GFX11-NEXT: ; implicit-def: $vcc_lo +; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 7 ; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 8 -; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 9 ; GFX11-NEXT: ; implicit-def: $vcc_lo +; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 9 ; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 10 -; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 11 ; GFX11-NEXT: ; implicit-def: $vcc_lo +; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 11 ; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 12 -; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 13 ; GFX11-NEXT: ; implicit-def: $vcc_lo +; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 13 ; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 14 ; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 15 ; GFX11-NEXT: .LBB13_3: ; %Flow @@ -45248,147 +45248,149 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v37 ; SI-NEXT: s_branch .LBB37_5 ; SI-NEXT: .LBB37_3: -; SI-NEXT: ; implicit-def: $sgpr61 ; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: v_writelane_b32 v61, s60, 0 +; SI-NEXT: ; implicit-def: $sgpr61 ; SI-NEXT: v_writelane_b32 v61, s61, 1 -; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: v_writelane_b32 v61, s60, 2 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s60, 2 ; SI-NEXT: v_writelane_b32 v61, s61, 3 -; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: v_writelane_b32 v61, s60, 4 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s60, 4 ; SI-NEXT: v_writelane_b32 v61, s61, 5 -; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: v_writelane_b32 v61, s60, 6 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s60, 6 ; SI-NEXT: v_writelane_b32 v61, s61, 7 -; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: v_writelane_b32 v61, s60, 8 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s60, 8 ; SI-NEXT: v_writelane_b32 v61, s61, 9 -; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: v_writelane_b32 v61, s60, 10 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s60, 10 ; SI-NEXT: v_writelane_b32 v61, s61, 11 -; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: v_writelane_b32 v61, s60, 12 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s60, 12 ; SI-NEXT: v_writelane_b32 v61, s61, 13 -; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: v_writelane_b32 v61, s60, 14 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s60, 14 ; SI-NEXT: v_writelane_b32 v61, s61, 15 -; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: v_writelane_b32 v61, s60, 16 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s60, 16 ; SI-NEXT: v_writelane_b32 v61, s61, 17 -; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: v_writelane_b32 v61, s60, 18 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s60, 18 ; SI-NEXT: v_writelane_b32 v61, s61, 19 -; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: v_writelane_b32 v61, s60, 20 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s60, 20 ; SI-NEXT: v_writelane_b32 v61, s61, 21 -; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: v_writelane_b32 v61, s60, 22 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s60, 22 ; SI-NEXT: v_writelane_b32 v61, s61, 23 -; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: v_writelane_b32 v61, s60, 24 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s60, 24 ; SI-NEXT: v_writelane_b32 v61, s61, 25 -; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: v_writelane_b32 v61, s60, 26 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s60, 26 ; SI-NEXT: v_writelane_b32 v61, s61, 27 -; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: v_writelane_b32 v61, s60, 28 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s60, 28 ; SI-NEXT: v_writelane_b32 v61, s61, 29 -; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: v_writelane_b32 v61, s60, 30 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s60, 30 ; SI-NEXT: v_writelane_b32 v61, s61, 31 -; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: v_writelane_b32 v61, s60, 32 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s60, 32 ; SI-NEXT: v_writelane_b32 v61, s61, 33 -; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: v_writelane_b32 v61, s60, 34 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s60, 34 ; SI-NEXT: v_writelane_b32 v61, s61, 35 -; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: v_writelane_b32 v61, s60, 36 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s60, 36 ; SI-NEXT: v_writelane_b32 v61, s61, 37 -; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: v_writelane_b32 v61, s60, 38 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s60, 38 ; SI-NEXT: v_writelane_b32 v61, s61, 39 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: v_writelane_b32 v61, s60, 40 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s61, 41 ; SI-NEXT: ; implicit-def: $sgpr97 ; SI-NEXT: ; implicit-def: $sgpr96 ; SI-NEXT: ; implicit-def: $sgpr28 @@ -45397,8 +45399,6 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr27 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s60, 40 -; SI-NEXT: v_writelane_b32 v61, s61, 41 ; SI-NEXT: ; implicit-def: $sgpr98 ; SI-NEXT: ; implicit-def: $sgpr86 ; SI-NEXT: ; implicit-def: $sgpr84 @@ -45570,6 +45570,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v55, s97 ; SI-NEXT: v_mov_b32_e32 v54, s96 ; SI-NEXT: v_mov_b32_e32 v52, s60 ; SI-NEXT: v_mov_b32_e32 v47, s28 @@ -45590,48 +45591,47 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_mov_b32_e32 v31, s46 ; SI-NEXT: v_mov_b32_e32 v36, s56 ; SI-NEXT: v_readlane_b32 s26, v61, 40 +; SI-NEXT: v_readlane_b32 s27, v61, 41 ; SI-NEXT: v_readlane_b32 s28, v61, 38 +; SI-NEXT: v_readlane_b32 s29, v61, 39 ; SI-NEXT: v_readlane_b32 s6, v61, 36 +; SI-NEXT: v_readlane_b32 s7, v61, 37 ; SI-NEXT: v_readlane_b32 s58, v61, 34 +; SI-NEXT: v_readlane_b32 s59, v61, 35 ; SI-NEXT: v_readlane_b32 s60, v61, 32 +; SI-NEXT: v_readlane_b32 s61, v61, 33 ; SI-NEXT: v_readlane_b32 s8, v61, 30 +; SI-NEXT: v_readlane_b32 s9, v61, 31 ; SI-NEXT: v_readlane_b32 s10, v61, 28 +; SI-NEXT: v_readlane_b32 s11, v61, 29 ; SI-NEXT: v_readlane_b32 s12, v61, 26 +; SI-NEXT: v_readlane_b32 s13, v61, 27 ; SI-NEXT: v_readlane_b32 s14, v61, 24 +; SI-NEXT: v_readlane_b32 s15, v61, 25 ; SI-NEXT: v_readlane_b32 s16, v61, 22 +; SI-NEXT: v_readlane_b32 s17, v61, 23 ; SI-NEXT: s_mov_b32 s96, s94 ; SI-NEXT: v_readlane_b32 s94, v61, 20 -; SI-NEXT: v_readlane_b32 s18, v61, 18 -; SI-NEXT: v_readlane_b32 s20, v61, 16 -; SI-NEXT: v_readlane_b32 s22, v61, 14 -; SI-NEXT: v_readlane_b32 s24, v61, 12 -; SI-NEXT: v_readlane_b32 s40, v61, 10 -; SI-NEXT: v_readlane_b32 s42, v61, 8 -; SI-NEXT: v_readlane_b32 s44, v61, 6 -; SI-NEXT: v_readlane_b32 s46, v61, 4 -; SI-NEXT: v_readlane_b32 s56, v61, 2 -; SI-NEXT: v_readlane_b32 vcc_lo, v61, 0 -; SI-NEXT: v_mov_b32_e32 v55, s97 -; SI-NEXT: v_readlane_b32 s27, v61, 41 -; SI-NEXT: v_readlane_b32 s29, v61, 39 -; SI-NEXT: v_readlane_b32 s7, v61, 37 -; SI-NEXT: v_readlane_b32 s59, v61, 35 -; SI-NEXT: v_readlane_b32 s61, v61, 33 -; SI-NEXT: v_readlane_b32 s9, v61, 31 -; SI-NEXT: v_readlane_b32 s11, v61, 29 -; SI-NEXT: v_readlane_b32 s13, v61, 27 -; SI-NEXT: v_readlane_b32 s15, v61, 25 -; SI-NEXT: v_readlane_b32 s17, v61, 23 ; SI-NEXT: v_readlane_b32 s95, v61, 21 +; SI-NEXT: v_readlane_b32 s18, v61, 18 ; SI-NEXT: v_readlane_b32 s19, v61, 19 +; SI-NEXT: v_readlane_b32 s20, v61, 16 ; SI-NEXT: v_readlane_b32 s21, v61, 17 +; SI-NEXT: v_readlane_b32 s22, v61, 14 ; SI-NEXT: v_readlane_b32 s23, v61, 15 +; SI-NEXT: v_readlane_b32 s24, v61, 12 ; SI-NEXT: v_readlane_b32 s25, v61, 13 +; SI-NEXT: v_readlane_b32 s40, v61, 10 ; SI-NEXT: v_readlane_b32 s41, v61, 11 +; SI-NEXT: v_readlane_b32 s42, v61, 8 ; SI-NEXT: v_readlane_b32 s43, v61, 9 +; SI-NEXT: v_readlane_b32 s44, v61, 6 ; SI-NEXT: v_readlane_b32 s45, v61, 7 +; SI-NEXT: v_readlane_b32 s46, v61, 4 ; SI-NEXT: v_readlane_b32 s47, v61, 5 +; SI-NEXT: v_readlane_b32 s56, v61, 2 ; SI-NEXT: v_readlane_b32 s57, v61, 3 +; SI-NEXT: v_readlane_b32 vcc_lo, v61, 0 ; SI-NEXT: v_readlane_b32 vcc_hi, v61, 1 ; SI-NEXT: .LBB37_5: ; %end ; SI-NEXT: s_waitcnt vmcnt(14) @@ -82666,8 +82666,8 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s18 ; SI-NEXT: v_readlane_b32 s18, v23, 0 ; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: v_readlane_b32 s19, v23, 1 ; SI-NEXT: s_lshl_b32 s18, s18, 8 +; SI-NEXT: v_readlane_b32 s19, v23, 1 ; SI-NEXT: s_or_b32 s16, s16, s18 ; SI-NEXT: v_readlane_b32 s18, v23, 2 ; SI-NEXT: v_readlane_b32 s19, v23, 3 @@ -82700,8 +82700,8 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s16 ; SI-NEXT: v_readlane_b32 s16, v23, 6 ; SI-NEXT: s_and_b32 s14, s14, 0xff -; SI-NEXT: v_readlane_b32 s17, v23, 7 ; SI-NEXT: s_lshl_b32 s16, s16, 8 +; SI-NEXT: v_readlane_b32 s17, v23, 7 ; SI-NEXT: s_or_b32 s14, s14, s16 ; SI-NEXT: v_readlane_b32 s16, v23, 8 ; SI-NEXT: v_readlane_b32 s17, v23, 9 @@ -82734,8 +82734,8 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s14 ; SI-NEXT: v_readlane_b32 s14, v23, 12 ; SI-NEXT: s_and_b32 s12, s12, 0xff -; SI-NEXT: v_readlane_b32 s15, v23, 13 ; SI-NEXT: s_lshl_b32 s14, s14, 8 +; SI-NEXT: v_readlane_b32 s15, v23, 13 ; SI-NEXT: s_or_b32 s12, s12, s14 ; SI-NEXT: v_readlane_b32 s14, v23, 14 ; SI-NEXT: v_readlane_b32 s15, v23, 15 @@ -82768,8 +82768,8 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s12 ; SI-NEXT: v_readlane_b32 s12, v23, 18 ; SI-NEXT: s_and_b32 s10, s10, 0xff -; SI-NEXT: v_readlane_b32 s13, v23, 19 ; SI-NEXT: s_lshl_b32 s12, s12, 8 +; SI-NEXT: v_readlane_b32 s13, v23, 19 ; SI-NEXT: s_or_b32 s10, s10, s12 ; SI-NEXT: v_readlane_b32 s12, v23, 20 ; SI-NEXT: v_readlane_b32 s13, v23, 21 @@ -82802,8 +82802,8 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: v_readlane_b32 s10, v23, 24 ; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: v_readlane_b32 s11, v23, 25 ; SI-NEXT: s_lshl_b32 s10, s10, 8 +; SI-NEXT: v_readlane_b32 s11, v23, 25 ; SI-NEXT: s_or_b32 s8, s8, s10 ; SI-NEXT: v_readlane_b32 s10, v23, 26 ; SI-NEXT: v_readlane_b32 s11, v23, 27 @@ -82836,8 +82836,8 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_readlane_b32 s8, v23, 30 ; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: v_readlane_b32 s9, v23, 31 ; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: v_readlane_b32 s9, v23, 31 ; SI-NEXT: s_or_b32 s6, s6, s8 ; SI-NEXT: v_readlane_b32 s8, v23, 32 ; SI-NEXT: v_readlane_b32 s9, v23, 33 @@ -82955,8 +82955,6 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v23, s54, 0 ; SI-NEXT: ; implicit-def: $sgpr26 @@ -82965,172 +82963,174 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; implicit-def: $sgpr53 -; SI-NEXT: ; implicit-def: $sgpr52 -; SI-NEXT: ; implicit-def: $sgpr51 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr49 -; SI-NEXT: ; implicit-def: $sgpr38 -; SI-NEXT: ; implicit-def: $sgpr36 -; SI-NEXT: ; implicit-def: $sgpr34 -; SI-NEXT: ; implicit-def: $sgpr30 -; SI-NEXT: ; implicit-def: $sgpr94 -; SI-NEXT: ; implicit-def: $sgpr92 -; SI-NEXT: ; implicit-def: $sgpr90 -; SI-NEXT: ; implicit-def: $sgpr88 -; SI-NEXT: ; implicit-def: $sgpr78 -; SI-NEXT: ; implicit-def: $sgpr76 -; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr28 -; SI-NEXT: ; implicit-def: $sgpr98 -; SI-NEXT: ; implicit-def: $sgpr96 -; SI-NEXT: ; implicit-def: $sgpr86 -; SI-NEXT: ; implicit-def: $sgpr84 -; SI-NEXT: ; implicit-def: $sgpr82 -; SI-NEXT: ; implicit-def: $sgpr80 -; SI-NEXT: ; implicit-def: $sgpr70 -; SI-NEXT: ; implicit-def: $sgpr68 -; SI-NEXT: ; implicit-def: $sgpr66 -; SI-NEXT: ; implicit-def: $sgpr64 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: v_writelane_b32 v23, s54, 2 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: v_writelane_b32 v23, s55, 3 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: v_writelane_b32 v23, s54, 4 -; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s55, 5 -; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s55, 5 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s54, 6 -; SI-NEXT: v_writelane_b32 v23, s55, 7 ; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s54, 6 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s55, 7 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s54, 8 -; SI-NEXT: v_writelane_b32 v23, s55, 9 -; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: v_writelane_b32 v23, s54, 8 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s55, 9 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s54, 10 -; SI-NEXT: v_writelane_b32 v23, s55, 11 -; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: v_writelane_b32 v23, s54, 10 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s55, 11 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s54, 12 -; SI-NEXT: v_writelane_b32 v23, s55, 13 -; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: v_writelane_b32 v23, s54, 12 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s55, 13 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s54, 14 -; SI-NEXT: v_writelane_b32 v23, s55, 15 -; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: v_writelane_b32 v23, s54, 14 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s55, 15 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s54, 16 -; SI-NEXT: v_writelane_b32 v23, s55, 17 -; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: v_writelane_b32 v23, s54, 16 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s55, 17 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s54, 18 -; SI-NEXT: v_writelane_b32 v23, s55, 19 -; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: v_writelane_b32 v23, s54, 18 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s55, 19 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s54, 20 -; SI-NEXT: v_writelane_b32 v23, s55, 21 -; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: v_writelane_b32 v23, s54, 20 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s55, 21 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s54, 22 -; SI-NEXT: v_writelane_b32 v23, s55, 23 -; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: v_writelane_b32 v23, s54, 22 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s55, 23 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s54, 24 -; SI-NEXT: v_writelane_b32 v23, s55, 25 -; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: v_writelane_b32 v23, s54, 24 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s55, 25 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s54, 26 -; SI-NEXT: v_writelane_b32 v23, s55, 27 -; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: v_writelane_b32 v23, s54, 26 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s55, 27 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s54, 28 -; SI-NEXT: v_writelane_b32 v23, s55, 29 -; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: v_writelane_b32 v23, s54, 28 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s55, 29 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s54, 30 -; SI-NEXT: v_writelane_b32 v23, s55, 31 -; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: v_writelane_b32 v23, s54, 30 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s55, 31 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s54, 32 -; SI-NEXT: v_writelane_b32 v23, s55, 33 -; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: v_writelane_b32 v23, s54, 32 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s55, 33 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s54, 34 -; SI-NEXT: v_writelane_b32 v23, s55, 35 -; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: v_writelane_b32 v23, s54, 34 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s55, 35 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s54, 36 -; SI-NEXT: v_writelane_b32 v23, s55, 37 +; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: v_writelane_b32 v23, s54, 36 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s55, 37 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: v_writelane_b32 v23, s54, 38 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr98 +; SI-NEXT: ; implicit-def: $sgpr96 +; SI-NEXT: ; implicit-def: $sgpr86 +; SI-NEXT: ; implicit-def: $sgpr84 +; SI-NEXT: ; implicit-def: $sgpr82 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr64 ; SI-NEXT: v_writelane_b32 v23, s55, 39 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: s_branch .LBB57_2 ; ; VI-LABEL: bitcast_v16i64_to_v128i8_scalar: @@ -85081,10 +85081,9 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: .LBB57_4: ; GFX9-NEXT: ; implicit-def: $sgpr27 ; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr83 -; GFX9-NEXT: ; implicit-def: $sgpr82 ; GFX9-NEXT: ; implicit-def: $sgpr27 ; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr82 ; GFX9-NEXT: v_writelane_b32 v22, s82, 0 ; GFX9-NEXT: ; implicit-def: $sgpr27 ; GFX9-NEXT: ; kill: killed $sgpr27 @@ -85117,6 +85116,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: ; implicit-def: $sgpr85 ; GFX9-NEXT: ; implicit-def: $sgpr84 ; GFX9-NEXT: ; implicit-def: $sgpr81 +; GFX9-NEXT: ; implicit-def: $sgpr83 ; GFX9-NEXT: ; implicit-def: $sgpr36 ; GFX9-NEXT: ; implicit-def: $sgpr34 ; GFX9-NEXT: ; implicit-def: $sgpr30 @@ -85469,12 +85469,10 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: s_lshr_b64 s[62:63], s[20:21], 24 ; GFX11-NEXT: s_branch .LBB57_3 ; GFX11-NEXT: .LBB57_2: -; GFX11-NEXT: ; implicit-def: $vcc_hi ; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: s_mov_b32 s101, -1 ; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 0 +; GFX11-NEXT: ; implicit-def: $vcc_hi +; GFX11-NEXT: ; implicit-def: $vcc_lo ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 @@ -85485,7 +85483,6 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 1 -; GFX11-NEXT: ; implicit-def: $vcc_lo ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 @@ -85497,6 +85494,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 2 +; GFX11-NEXT: ; implicit-def: $vcc_lo ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 @@ -85508,7 +85506,6 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 3 -; GFX11-NEXT: ; implicit-def: $vcc_lo ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 @@ -85520,6 +85517,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 4 +; GFX11-NEXT: ; implicit-def: $vcc_lo ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 @@ -85528,10 +85526,12 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: s_mov_b32 s101, -1 +; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 5 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 5 -; GFX11-NEXT: ; implicit-def: $vcc_lo ; GFX11-NEXT: ; implicit-def: $sgpr45 ; GFX11-NEXT: ; implicit-def: $sgpr44 ; GFX11-NEXT: ; implicit-def: $sgpr30 @@ -85595,17 +85595,17 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: ; implicit-def: $sgpr88 ; GFX11-NEXT: ; implicit-def: $sgpr76 ; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 6 -; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 7 ; GFX11-NEXT: ; implicit-def: $vcc_lo +; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 7 ; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 8 -; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 9 ; GFX11-NEXT: ; implicit-def: $vcc_lo +; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 9 ; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 10 -; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 11 ; GFX11-NEXT: ; implicit-def: $vcc_lo +; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 11 ; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 12 -; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 13 ; GFX11-NEXT: ; implicit-def: $vcc_lo +; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 13 ; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 14 ; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 15 ; GFX11-NEXT: .LBB57_3: ; %Flow @@ -117783,8 +117783,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: v_writelane_b32 v62, s48, 0 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 @@ -117792,37 +117790,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr38 -; SI-NEXT: ; implicit-def: $sgpr36 -; SI-NEXT: ; implicit-def: $sgpr34 -; SI-NEXT: ; implicit-def: $sgpr30 -; SI-NEXT: ; implicit-def: $sgpr94 -; SI-NEXT: ; implicit-def: $sgpr92 -; SI-NEXT: ; implicit-def: $sgpr90 -; SI-NEXT: ; implicit-def: $sgpr88 -; SI-NEXT: ; implicit-def: $sgpr78 -; SI-NEXT: ; implicit-def: $sgpr76 -; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr28 -; SI-NEXT: ; implicit-def: $sgpr98 -; SI-NEXT: ; implicit-def: $sgpr96 -; SI-NEXT: ; implicit-def: $sgpr86 -; SI-NEXT: ; implicit-def: $sgpr84 -; SI-NEXT: ; implicit-def: $sgpr82 -; SI-NEXT: ; implicit-def: $sgpr80 -; SI-NEXT: ; implicit-def: $sgpr70 -; SI-NEXT: ; implicit-def: $sgpr68 -; SI-NEXT: ; implicit-def: $sgpr66 -; SI-NEXT: ; implicit-def: $sgpr64 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; implicit-def: $sgpr52 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: v_writelane_b32 v62, s48, 2 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 @@ -117830,8 +117797,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: v_writelane_b32 v62, s48, 4 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 @@ -117839,8 +117804,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: v_writelane_b32 v62, s48, 6 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 @@ -117848,8 +117811,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: v_writelane_b32 v62, s48, 8 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 @@ -117857,8 +117818,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: v_writelane_b32 v62, s48, 10 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 @@ -117866,8 +117825,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: v_writelane_b32 v62, s48, 12 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 @@ -117875,8 +117832,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: v_writelane_b32 v62, s48, 14 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 @@ -117884,8 +117839,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: v_writelane_b32 v62, s48, 16 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 @@ -117893,8 +117846,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: v_writelane_b32 v62, s48, 18 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 @@ -117902,8 +117853,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: v_writelane_b32 v62, s48, 20 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 @@ -117911,8 +117860,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: v_writelane_b32 v62, s48, 22 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 @@ -117920,44 +117867,97 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: v_writelane_b32 v62, s48, 24 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: v_writelane_b32 v62, s49, 25 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr48 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr48 ; SI-NEXT: v_writelane_b32 v62, s48, 26 -; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: v_writelane_b32 v62, s49, 27 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: v_writelane_b32 v62, s48, 28 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v62, s49, 29 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: v_writelane_b32 v62, s48, 30 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v62, s48, 28 -; SI-NEXT: v_writelane_b32 v62, s49, 29 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v62, s49, 31 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: v_writelane_b32 v62, s48, 32 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v62, s49, 33 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v62, s48, 30 -; SI-NEXT: v_writelane_b32 v62, s49, 31 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr98 +; SI-NEXT: ; implicit-def: $sgpr96 +; SI-NEXT: ; implicit-def: $sgpr86 +; SI-NEXT: ; implicit-def: $sgpr84 +; SI-NEXT: ; implicit-def: $sgpr82 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr50 ; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v62, s48, 32 -; SI-NEXT: v_writelane_b32 v62, s49, 33 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr48 ; SI-NEXT: s_branch .LBB73_2 ; SI-NEXT: .LBB73_4: ; SI-NEXT: v_mov_b32_e32 v1, s38 @@ -118013,9 +118013,9 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: v_mov_b32_e32 v16, s26 ; SI-NEXT: v_readlane_b32 s26, v62, 30 ; SI-NEXT: v_readlane_b32 s27, v62, 31 +; SI-NEXT: v_mov_b32_e32 v3, s8 ; SI-NEXT: v_mov_b32_e32 v51, s26 ; SI-NEXT: v_readlane_b32 s26, v62, 32 -; SI-NEXT: v_mov_b32_e32 v3, s8 ; SI-NEXT: v_readlane_b32 s27, v62, 33 ; SI-NEXT: v_mov_b32_e32 v38, s72 ; SI-NEXT: v_mov_b32_e32 v49, s62 @@ -167832,8 +167832,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_lshr_b32 s13, s4, 16 ; SI-NEXT: s_mov_b32 s5, s13 ; SI-NEXT: v_writelane_b32 v61, s4, 26 -; SI-NEXT: v_writelane_b32 v61, s5, 27 ; SI-NEXT: v_readfirstlane_b32 s4, v46 +; SI-NEXT: v_writelane_b32 v61, s5, 27 ; SI-NEXT: s_lshr_b32 s5, s4, 16 ; SI-NEXT: v_readfirstlane_b32 s4, v10 ; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 16 @@ -168027,191 +168027,92 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_branch .LBB91_3 ; SI-NEXT: .LBB91_2: ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v54, v59 ; SI-NEXT: v_writelane_b32 v62, s4, 0 ; SI-NEXT: v_writelane_b32 v62, s5, 1 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v59, v51 ; SI-NEXT: v_writelane_b32 v62, s4, 2 ; SI-NEXT: v_writelane_b32 v62, s5, 3 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v31, v46 ; SI-NEXT: v_writelane_b32 v62, s4, 4 ; SI-NEXT: v_writelane_b32 v62, s5, 5 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v34, v22 ; SI-NEXT: v_writelane_b32 v62, s4, 6 ; SI-NEXT: v_writelane_b32 v62, s5, 7 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v22, v24 ; SI-NEXT: v_writelane_b32 v62, s4, 8 ; SI-NEXT: v_writelane_b32 v62, s5, 9 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v7, v37 ; SI-NEXT: v_writelane_b32 v62, s4, 10 ; SI-NEXT: v_writelane_b32 v62, s5, 11 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: v_writelane_b32 v62, s4, 12 ; SI-NEXT: v_writelane_b32 v62, s5, 13 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: v_writelane_b32 v62, s4, 14 ; SI-NEXT: v_writelane_b32 v62, s5, 15 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v54, v59 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr21 -; SI-NEXT: ; implicit-def: $sgpr20 -; SI-NEXT: ; implicit-def: $sgpr89 -; SI-NEXT: ; implicit-def: $sgpr88 -; SI-NEXT: v_mov_b32_e32 v44, v1 ; SI-NEXT: v_writelane_b32 v62, s4, 16 +; SI-NEXT: v_mov_b32_e32 v59, v51 +; SI-NEXT: v_mov_b32_e32 v31, v46 +; SI-NEXT: v_mov_b32_e32 v34, v22 +; SI-NEXT: v_mov_b32_e32 v22, v24 +; SI-NEXT: v_mov_b32_e32 v7, v37 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: v_writelane_b32 v62, s5, 17 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: v_mov_b32_e32 v52, v17 ; SI-NEXT: v_writelane_b32 v62, s4, 18 ; SI-NEXT: v_writelane_b32 v62, s5, 19 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v43, v20 ; SI-NEXT: v_writelane_b32 v62, s4, 20 ; SI-NEXT: v_writelane_b32 v62, s5, 21 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v42, v32 ; SI-NEXT: v_writelane_b32 v62, s4, 22 ; SI-NEXT: v_writelane_b32 v62, s5, 23 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v41, v5 ; SI-NEXT: v_writelane_b32 v62, s4, 24 ; SI-NEXT: v_writelane_b32 v62, s5, 25 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: s_mov_b64 vcc, -1 ; SI-NEXT: v_writelane_b32 v62, s4, 26 ; SI-NEXT: v_writelane_b32 v62, s5, 27 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v60, v35 ; SI-NEXT: v_writelane_b32 v62, s4, 28 ; SI-NEXT: v_writelane_b32 v62, s5, 29 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: v_mov_b32_e32 v35, v6 ; SI-NEXT: v_writelane_b32 v62, s4, 30 ; SI-NEXT: v_writelane_b32 v62, s5, 31 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v32, v4 ; SI-NEXT: v_writelane_b32 v62, s4, 32 ; SI-NEXT: v_writelane_b32 v62, s5, 33 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v30, v12 ; SI-NEXT: v_writelane_b32 v62, s4, 34 ; SI-NEXT: v_writelane_b32 v62, s5, 35 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v19, v39 ; SI-NEXT: v_writelane_b32 v62, s4, 36 ; SI-NEXT: v_writelane_b32 v62, s5, 37 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: v_mov_b32_e32 v39, v25 ; SI-NEXT: v_writelane_b32 v62, s4, 38 ; SI-NEXT: v_writelane_b32 v62, s5, 39 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v12, v29 ; SI-NEXT: v_writelane_b32 v62, s4, 40 ; SI-NEXT: v_writelane_b32 v62, s5, 41 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v20, v2 ; SI-NEXT: v_writelane_b32 v62, s4, 42 ; SI-NEXT: v_writelane_b32 v62, s5, 43 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v6, v55 ; SI-NEXT: v_writelane_b32 v62, s4, 44 ; SI-NEXT: v_writelane_b32 v62, s5, 45 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v17, v8 ; SI-NEXT: v_writelane_b32 v62, s4, 46 ; SI-NEXT: v_writelane_b32 v62, s5, 47 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v29, v33 ; SI-NEXT: v_writelane_b32 v62, s4, 48 ; SI-NEXT: v_writelane_b32 v62, s5, 49 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr96 -; SI-NEXT: ; implicit-def: $sgpr78 -; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $sgpr86 -; SI-NEXT: ; implicit-def: $sgpr84 -; SI-NEXT: ; implicit-def: $sgpr61 -; SI-NEXT: ; implicit-def: $sgpr65 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $sgpr80 -; SI-NEXT: ; implicit-def: $sgpr70 -; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $sgpr69 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $sgpr66 -; SI-NEXT: ; implicit-def: $sgpr38 -; SI-NEXT: ; implicit-def: $sgpr75 -; SI-NEXT: ; implicit-def: $sgpr91 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $sgpr52 -; SI-NEXT: ; implicit-def: $sgpr98 -; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr37 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $sgpr30 -; SI-NEXT: ; implicit-def: $sgpr82 -; SI-NEXT: ; implicit-def: $sgpr43 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $sgpr27 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $sgpr28 -; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $sgpr22 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $sgpr18 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $sgpr15 -; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; implicit-def: $sgpr94 -; SI-NEXT: ; implicit-def: $sgpr57 -; SI-NEXT: ; implicit-def: $sgpr92 -; SI-NEXT: ; implicit-def: $sgpr64 -; SI-NEXT: ; implicit-def: $sgpr79 -; SI-NEXT: ; implicit-def: $sgpr76 -; SI-NEXT: ; implicit-def: $sgpr90 -; SI-NEXT: ; implicit-def: $sgpr73 -; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; implicit-def: $sgpr59 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr68 -; SI-NEXT: ; implicit-def: $sgpr45 -; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $sgpr29 -; SI-NEXT: ; implicit-def: $sgpr24 -; SI-NEXT: ; implicit-def: $sgpr34 -; SI-NEXT: ; implicit-def: $sgpr23 -; SI-NEXT: ; implicit-def: $sgpr16 -; SI-NEXT: ; implicit-def: $sgpr36 -; SI-NEXT: ; implicit-def: $sgpr19 -; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr13 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: v_writelane_b32 v62, s4, 50 ; SI-NEXT: v_writelane_b32 v62, s5, 51 ; SI-NEXT: ; implicit-def: $sgpr4 @@ -168231,7 +168132,6 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_writelane_b32 v62, s5, 61 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: v_writelane_b32 v62, s4, 62 -; SI-NEXT: v_writelane_b32 v62, s5, 63 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: v_writelane_b32 v61, s4, 0 ; SI-NEXT: v_writelane_b32 v61, s5, 1 @@ -168271,18 +168171,118 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: v_writelane_b32 v61, s4, 24 ; SI-NEXT: v_writelane_b32 v61, s5, 25 -; SI-NEXT: ; implicit-def: $sgpr5 ; SI-NEXT: v_writelane_b32 v61, s4, 26 +; SI-NEXT: v_writelane_b32 v62, s5, 63 +; SI-NEXT: ; implicit-def: $sgpr5 ; SI-NEXT: v_writelane_b32 v61, s5, 27 +; SI-NEXT: ; implicit-def: $sgpr20 ; SI-NEXT: v_writelane_b32 v61, s20, 28 +; SI-NEXT: ; implicit-def: $sgpr21 ; SI-NEXT: v_writelane_b32 v61, s21, 29 ; SI-NEXT: ; implicit-def: $sgpr20 -; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: v_writelane_b32 v61, s20, 30 ; SI-NEXT: v_writelane_b32 v61, s21, 31 +; SI-NEXT: v_mov_b32_e32 v44, v1 +; SI-NEXT: ; implicit-def: $sgpr88 ; SI-NEXT: v_writelane_b32 v61, s88, 32 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_mov_b32_e32 v52, v17 +; SI-NEXT: v_mov_b32_e32 v43, v20 +; SI-NEXT: v_mov_b32_e32 v42, v32 +; SI-NEXT: v_mov_b32_e32 v41, v5 +; SI-NEXT: s_mov_b64 vcc, -1 +; SI-NEXT: ; implicit-def: $sgpr89 ; SI-NEXT: v_writelane_b32 v61, s89, 33 +; SI-NEXT: v_mov_b32_e32 v60, v35 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: v_mov_b32_e32 v35, v6 +; SI-NEXT: v_mov_b32_e32 v32, v4 +; SI-NEXT: v_mov_b32_e32 v30, v12 +; SI-NEXT: v_mov_b32_e32 v19, v39 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_mov_b32_e32 v39, v25 +; SI-NEXT: v_mov_b32_e32 v12, v29 +; SI-NEXT: v_mov_b32_e32 v20, v2 +; SI-NEXT: v_mov_b32_e32 v6, v55 +; SI-NEXT: v_mov_b32_e32 v17, v8 +; SI-NEXT: v_mov_b32_e32 v29, v33 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr96 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $sgpr86 +; SI-NEXT: ; implicit-def: $sgpr84 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr98 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr37 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr82 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr16 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr19 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: .LBB91_3: ; %Flow ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -168307,30 +168307,29 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v53 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v38 ; SI-NEXT: v_readfirstlane_b32 s6, v9 ; SI-NEXT: v_readfirstlane_b32 s8, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v40 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v38 ; SI-NEXT: s_lshr_b32 s9, s6, 16 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_readfirstlane_b32 s6, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: s_lshr_b64 s[10:11], s[8:9], 16 -; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: v_readfirstlane_b32 s6, v8 ; SI-NEXT: v_readfirstlane_b32 s8, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v54 -; SI-NEXT: s_mov_b32 s7, s9 +; SI-NEXT: s_lshr_b32 s9, s6, 16 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v58 -; SI-NEXT: v_writelane_b32 v61, s6, 26 ; SI-NEXT: s_lshr_b64 s[20:21], s[8:9], 16 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_readfirstlane_b32 s8, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v49 -; SI-NEXT: v_writelane_b32 v61, s7, 27 +; SI-NEXT: v_writelane_b32 v61, s6, 26 ; SI-NEXT: v_readfirstlane_b32 s6, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50 +; SI-NEXT: s_mov_b32 s7, s9 ; SI-NEXT: s_lshr_b32 s9, s6, 16 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_readfirstlane_b32 s18, v3 @@ -168351,6 +168350,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_writelane_b32 v61, s7, 27 ; SI-NEXT: s_lshr_b64 s[26:27], s[18:19], 16 ; SI-NEXT: s_mov_b32 s17, s26 ; SI-NEXT: s_mov_b32 s11, s20 @@ -168801,8 +168801,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: .LBB91_5: ; %end ; SI-NEXT: s_and_b32 s5, s8, 0xff ; SI-NEXT: v_readlane_b32 s8, v62, 0 -; SI-NEXT: v_readlane_b32 s9, v62, 1 ; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: v_readlane_b32 s9, v62, 1 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: v_readlane_b32 s8, v62, 2 ; SI-NEXT: v_readlane_b32 s9, v62, 3 @@ -168829,10 +168829,10 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_readlane_b32 vcc_hi, v62, 5 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: s_and_b32 s5, s86, 0xff -; SI-NEXT: v_readlane_b32 s9, v62, 7 ; SI-NEXT: s_lshl_b32 s8, s8, 8 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: v_readlane_b32 s9, v62, 7 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: v_readlane_b32 s8, v62, 8 ; SI-NEXT: v_readlane_b32 s9, v62, 9 @@ -168860,8 +168860,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_readlane_b32 s8, v62, 12 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: s_and_b32 s5, s80, 0xff -; SI-NEXT: v_readlane_b32 s9, v62, 13 ; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: v_readlane_b32 s9, v62, 13 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: v_readlane_b32 s8, v62, 14 ; SI-NEXT: v_readlane_b32 s9, v62, 15 @@ -168890,9 +168890,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_readlane_b32 s8, v62, 18 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: s_and_b32 s5, s66, 0xff -; SI-NEXT: v_readlane_b32 s9, v62, 19 ; SI-NEXT: s_lshl_b32 s8, s8, 8 -; SI-NEXT: v_readlane_b32 s61, v62, 17 +; SI-NEXT: v_readlane_b32 s9, v62, 19 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: v_readlane_b32 s8, v62, 20 ; SI-NEXT: v_readlane_b32 s9, v62, 21 @@ -168921,9 +168920,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_readlane_b32 s8, v62, 24 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: s_and_b32 s5, s52, 0xff -; SI-NEXT: v_readlane_b32 s9, v62, 25 ; SI-NEXT: s_lshl_b32 s8, s8, 8 -; SI-NEXT: v_readlane_b32 s61, v62, 23 +; SI-NEXT: v_readlane_b32 s9, v62, 25 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: v_readlane_b32 s8, v62, 26 ; SI-NEXT: v_readlane_b32 s9, v62, 27 @@ -168952,9 +168950,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_readlane_b32 s8, v62, 30 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: s_and_b32 s5, s30, 0xff -; SI-NEXT: v_readlane_b32 s9, v62, 31 ; SI-NEXT: s_lshl_b32 s8, s8, 8 -; SI-NEXT: v_readlane_b32 s61, v62, 29 +; SI-NEXT: v_readlane_b32 s9, v62, 31 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: v_readlane_b32 s8, v62, 32 ; SI-NEXT: v_readlane_b32 s9, v62, 33 @@ -168983,8 +168980,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_readlane_b32 s8, v62, 36 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: s_and_b32 s5, s50, 0xff -; SI-NEXT: v_readlane_b32 s9, v62, 37 ; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: v_readlane_b32 s9, v62, 37 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: v_readlane_b32 s8, v62, 38 ; SI-NEXT: v_readlane_b32 s9, v62, 39 @@ -169013,9 +169010,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_readlane_b32 s8, v62, 42 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: s_and_b32 s5, s92, 0xff -; SI-NEXT: v_readlane_b32 s9, v62, 43 ; SI-NEXT: s_lshl_b32 s8, s8, 8 -; SI-NEXT: v_readlane_b32 s43, v62, 41 +; SI-NEXT: v_readlane_b32 s9, v62, 43 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: v_readlane_b32 s8, v62, 44 ; SI-NEXT: v_readlane_b32 s9, v62, 45 @@ -169044,8 +169040,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_readlane_b32 s8, v62, 48 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: s_and_b32 s5, s76, 0xff -; SI-NEXT: v_readlane_b32 s9, v62, 49 ; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: v_readlane_b32 s9, v62, 49 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: v_readlane_b32 s8, v62, 50 ; SI-NEXT: v_readlane_b32 s9, v62, 51 @@ -169074,9 +169070,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_readlane_b32 s8, v62, 54 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: s_and_b32 s5, s62, 0xff -; SI-NEXT: v_readlane_b32 s9, v62, 55 ; SI-NEXT: s_lshl_b32 s8, s8, 8 -; SI-NEXT: v_readlane_b32 s27, v62, 53 +; SI-NEXT: v_readlane_b32 s9, v62, 55 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: v_readlane_b32 s8, v62, 56 ; SI-NEXT: v_readlane_b32 s9, v62, 57 @@ -169105,9 +169100,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_readlane_b32 s8, v62, 60 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: s_and_b32 s5, s46, 0xff -; SI-NEXT: v_readlane_b32 s9, v62, 61 ; SI-NEXT: s_lshl_b32 s8, s8, 8 -; SI-NEXT: v_readlane_b32 s27, v62, 59 +; SI-NEXT: v_readlane_b32 s9, v62, 61 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: v_readlane_b32 s8, v62, 62 ; SI-NEXT: v_readlane_b32 s9, v62, 63 @@ -169136,9 +169130,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_readlane_b32 s8, v61, 2 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: s_and_b32 s5, s40, 0xff -; SI-NEXT: v_readlane_b32 s9, v61, 3 ; SI-NEXT: s_lshl_b32 s8, s8, 8 -; SI-NEXT: v_readlane_b32 s27, v61, 1 +; SI-NEXT: v_readlane_b32 s9, v61, 3 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: v_readlane_b32 s8, v61, 4 ; SI-NEXT: v_readlane_b32 s9, v61, 5 @@ -169167,8 +169160,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_readlane_b32 s8, v61, 8 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: s_and_b32 s5, s24, 0xff -; SI-NEXT: v_readlane_b32 s9, v61, 9 ; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: v_readlane_b32 s9, v61, 9 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: v_readlane_b32 s8, v61, 10 ; SI-NEXT: v_readlane_b32 s9, v61, 11 @@ -169197,8 +169190,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_readlane_b32 s8, v61, 14 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: s_and_b32 s5, s16, 0xff -; SI-NEXT: v_readlane_b32 s9, v61, 15 ; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: v_readlane_b32 s9, v61, 15 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: v_readlane_b32 s8, v61, 16 ; SI-NEXT: v_readlane_b32 s9, v61, 17 @@ -169223,14 +169216,24 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_and_b32 s8, s19, 0xff ; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: v_readlane_b32 s9, v61, 21 +; SI-NEXT: v_readlane_b32 s9, v61, 23 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: v_readlane_b32 s61, v62, 17 +; SI-NEXT: v_readlane_b32 s27, v62, 53 +; SI-NEXT: v_readlane_b32 s61, v62, 23 +; SI-NEXT: v_readlane_b32 s27, v62, 59 +; SI-NEXT: v_readlane_b32 s61, v62, 29 +; SI-NEXT: v_readlane_b32 s43, v62, 41 +; SI-NEXT: v_readlane_b32 s27, v61, 1 ; SI-NEXT: v_readlane_b32 s61, v62, 35 ; SI-NEXT: v_readlane_b32 s43, v62, 47 ; SI-NEXT: v_readlane_b32 s27, v61, 7 ; SI-NEXT: v_readlane_b32 s21, v61, 13 ; SI-NEXT: v_readlane_b32 s17, v61, 19 +; SI-NEXT: v_readlane_b32 s11, v61, 25 ; SI-NEXT: v_readlane_b32 s99, v63, 35 ; SI-NEXT: v_readlane_b32 s98, v63, 34 ; SI-NEXT: v_readlane_b32 s97, v63, 33 @@ -169273,11 +169276,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_readlane_b32 s8, v61, 20 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: s_and_b32 s5, s10, 0xff -; SI-NEXT: v_readlane_b32 s9, v61, 21 ; SI-NEXT: s_lshl_b32 s8, s8, 8 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: v_readlane_b32 s8, v61, 22 -; SI-NEXT: v_readlane_b32 s9, v61, 23 ; SI-NEXT: s_and_b32 s8, s8, 0xff ; SI-NEXT: v_readlane_b32 s10, v61, 24 ; SI-NEXT: s_lshl_b32 s8, s8, 16 @@ -169301,16 +169302,16 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 -; SI-NEXT: v_readlane_b32 s11, v61, 25 +; SI-NEXT: v_readlane_b32 s9, v61, 29 +; SI-NEXT: v_readlane_b32 s9, v61, 31 +; SI-NEXT: v_readlane_b32 s9, v61, 33 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; SI-NEXT: v_or_b32_e32 v1, s8, v1 ; SI-NEXT: v_readlane_b32 s8, v61, 28 -; SI-NEXT: v_readlane_b32 s9, v61, 29 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: s_lshl_b32 s5, s8, 8 ; SI-NEXT: v_readlane_b32 s8, v61, 30 -; SI-NEXT: v_readlane_b32 s9, v61, 31 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s8, 0xff ; SI-NEXT: v_readlane_b32 s8, v61, 32 @@ -169332,7 +169333,6 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: v_readlane_b32 s9, v61, 33 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 @@ -170923,7 +170923,6 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: s_and_b32 s5, s6, 0xff ; VI-NEXT: v_readlane_b32 s6, v22, 49 -; VI-NEXT: v_readlane_b32 s9, v22, 5 ; VI-NEXT: s_lshl_b32 s6, s6, 8 ; VI-NEXT: s_or_b32 s5, s5, s6 ; VI-NEXT: v_readlane_b32 s6, v22, 48 @@ -170980,6 +170979,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x78, v0 ; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_readlane_b32 s9, v22, 5 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 ; VI-NEXT: v_mov_b32_e32 v1, s4 @@ -171030,42 +171030,41 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: .LBB91_4: ; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr61 ; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr63 -; VI-NEXT: ; implicit-def: $sgpr62 -; VI-NEXT: ; implicit-def: $sgpr73 -; VI-NEXT: ; implicit-def: $sgpr72 -; VI-NEXT: ; implicit-def: $sgpr75 -; VI-NEXT: ; implicit-def: $sgpr74 -; VI-NEXT: ; implicit-def: $sgpr77 -; VI-NEXT: ; implicit-def: $sgpr76 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 ; VI-NEXT: v_writelane_b32 v22, s60, 0 ; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr61 ; VI-NEXT: v_writelane_b32 v22, s61, 1 ; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr62 ; VI-NEXT: v_writelane_b32 v22, s62, 2 ; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr63 ; VI-NEXT: v_writelane_b32 v22, s63, 3 ; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr72 ; VI-NEXT: v_writelane_b32 v22, s72, 4 ; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr73 ; VI-NEXT: v_writelane_b32 v22, s73, 5 ; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr74 ; VI-NEXT: v_writelane_b32 v22, s74, 6 ; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr75 ; VI-NEXT: v_writelane_b32 v22, s75, 7 ; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr76 ; VI-NEXT: v_writelane_b32 v22, s76, 8 ; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; kill: killed $sgpr46 @@ -171079,6 +171078,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: ; implicit-def: $sgpr47 ; VI-NEXT: ; implicit-def: $sgpr65 ; VI-NEXT: ; implicit-def: $sgpr53 +; VI-NEXT: ; implicit-def: $sgpr77 ; VI-NEXT: ; implicit-def: $sgpr59 ; VI-NEXT: ; implicit-def: $sgpr66 ; VI-NEXT: ; implicit-def: $sgpr78 @@ -173106,11 +173106,11 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: .LBB91_2: ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr43 ; GFX11-NEXT: ; implicit-def: $sgpr46 ; GFX11-NEXT: ; kill: killed $sgpr46 -; GFX11-NEXT: ; implicit-def: $vcc_hi +; GFX11-NEXT: ; implicit-def: $sgpr43 ; GFX11-NEXT: ; implicit-def: $vcc_lo +; GFX11-NEXT: ; implicit-def: $vcc_hi ; GFX11-NEXT: s_mov_b32 s104, -1 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 @@ -173213,7 +173213,6 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: v_writelane_b32 v20, s42, 0 -; GFX11-NEXT: v_writelane_b32 v20, s43, 1 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 @@ -173223,14 +173222,15 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: v_writelane_b32 v20, s46, 2 +; GFX11-NEXT: v_writelane_b32 v20, s43, 1 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: v_writelane_b32 v20, s47, 3 +; GFX11-NEXT: v_writelane_b32 v20, s46, 2 ; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: v_writelane_b32 v20, s47, 3 ; GFX11-NEXT: v_writelane_b32 v20, vcc_lo, 4 -; GFX11-NEXT: v_writelane_b32 v20, vcc_hi, 5 ; GFX11-NEXT: ; implicit-def: $vcc_lo +; GFX11-NEXT: v_writelane_b32 v20, vcc_hi, 5 ; GFX11-NEXT: v_writelane_b32 v20, vcc_lo, 6 ; GFX11-NEXT: v_writelane_b32 v20, vcc_hi, 7 ; GFX11-NEXT: .LBB91_3: ; %Flow @@ -174070,82 +174070,82 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: s_lshr_b64 s[74:75], s[74:75], 24 ; GFX11-NEXT: s_lshr_b32 s75, s42, 8 ; GFX11-NEXT: v_writelane_b32 v20, s58, 0 +; GFX11-NEXT: s_lshr_b32 s58, s63, 24 ; GFX11-NEXT: s_lshr_b32 s26, s26, 16 ; GFX11-NEXT: s_lshr_b32 s65, s73, 24 ; GFX11-NEXT: s_pack_ll_b32_b16 s90, s26, s90 -; GFX11-NEXT: s_lshr_b32 s82, s73, 8 ; GFX11-NEXT: v_writelane_b32 v20, s59, 1 -; GFX11-NEXT: s_lshr_b32 s58, s63, 24 ; GFX11-NEXT: s_lshr_b32 s59, s63, 8 ; GFX11-NEXT: s_lshr_b64 s[62:63], s[62:63], 24 ; GFX11-NEXT: s_lshr_b32 s63, s93, 24 -; GFX11-NEXT: s_lshr_b32 s84, s72, 16 +; GFX11-NEXT: s_lshr_b32 s82, s73, 8 ; GFX11-NEXT: v_writelane_b32 v20, s63, 21 ; GFX11-NEXT: s_lshr_b32 s63, s93, 8 +; GFX11-NEXT: s_lshr_b32 s84, s72, 16 ; GFX11-NEXT: s_lshr_b32 s51, s72, 8 ; GFX11-NEXT: s_lshr_b64 s[72:73], s[72:73], 24 -; GFX11-NEXT: s_lshr_b32 s86, s77, 24 ; GFX11-NEXT: v_writelane_b32 v20, s63, 22 ; GFX11-NEXT: s_lshr_b32 s63, s92, 16 +; GFX11-NEXT: s_lshr_b32 s86, s77, 24 ; GFX11-NEXT: s_lshr_b32 s87, s77, 8 ; GFX11-NEXT: s_lshr_b32 s52, s76, 16 -; GFX11-NEXT: s_lshr_b32 s100, s76, 8 ; GFX11-NEXT: v_writelane_b32 v20, s63, 23 ; GFX11-NEXT: s_lshr_b32 s63, s92, 8 +; GFX11-NEXT: s_lshr_b32 s100, s76, 8 ; GFX11-NEXT: s_lshr_b64 s[76:77], s[76:77], 24 ; GFX11-NEXT: s_lshr_b32 s101, s89, 8 -; GFX11-NEXT: s_lshr_b32 s98, s79, 24 ; GFX11-NEXT: v_writelane_b32 v20, s63, 24 ; GFX11-NEXT: s_lshr_b32 s63, s95, 24 +; GFX11-NEXT: s_lshr_b32 s98, s79, 24 ; GFX11-NEXT: s_lshr_b32 s99, s79, 8 ; GFX11-NEXT: s_lshr_b32 s53, s78, 16 -; GFX11-NEXT: s_lshr_b32 s97, s78, 8 ; GFX11-NEXT: v_writelane_b32 v20, s63, 25 ; GFX11-NEXT: s_lshr_b32 s63, s95, 8 +; GFX11-NEXT: s_lshr_b32 s97, s78, 8 ; GFX11-NEXT: s_lshr_b64 s[78:79], s[78:79], 24 ; GFX11-NEXT: s_lshr_b64 s[92:93], s[92:93], 24 -; GFX11-NEXT: s_lshr_b32 s102, s94, 16 ; GFX11-NEXT: v_writelane_b32 v20, s63, 26 ; GFX11-NEXT: s_lshr_b32 s63, s43, 24 +; GFX11-NEXT: s_lshr_b32 s102, s94, 16 ; GFX11-NEXT: s_lshr_b32 s103, s94, 8 ; GFX11-NEXT: s_lshr_b64 s[94:95], s[94:95], 24 -; GFX11-NEXT: s_lshr_b32 s73, s91, 24 ; GFX11-NEXT: v_writelane_b32 v20, s63, 27 ; GFX11-NEXT: s_lshr_b32 s63, s43, 8 +; GFX11-NEXT: s_lshr_b32 s73, s91, 24 ; GFX11-NEXT: s_lshr_b32 s77, s91, 8 ; GFX11-NEXT: s_lshr_b32 s83, s90, 8 -; GFX11-NEXT: s_lshr_b32 s66, s37, 24 ; GFX11-NEXT: v_writelane_b32 v20, s63, 28 ; GFX11-NEXT: s_lshr_b32 s63, s42, 16 ; GFX11-NEXT: s_lshr_b64 s[42:43], s[42:43], 24 +; GFX11-NEXT: s_lshr_b32 s66, s37, 24 ; GFX11-NEXT: s_lshr_b32 s67, s37, 8 -; GFX11-NEXT: s_lshr_b32 s68, s36, 16 ; GFX11-NEXT: v_writelane_b32 v20, s42, 6 +; GFX11-NEXT: s_lshr_b32 s68, s36, 16 ; GFX11-NEXT: s_lshr_b32 s49, s36, 8 ; GFX11-NEXT: s_lshr_b32 s69, s35, 24 ; GFX11-NEXT: s_lshr_b32 s70, s35, 8 -; GFX11-NEXT: s_lshr_b32 s64, s34, 16 ; GFX11-NEXT: v_writelane_b32 v20, s43, 7 ; GFX11-NEXT: s_lshr_b64 s[42:43], s[44:45], 24 +; GFX11-NEXT: s_lshr_b32 s64, s34, 16 ; GFX11-NEXT: s_lshr_b32 s80, s34, 8 ; GFX11-NEXT: s_lshr_b32 s79, s45, 24 -; GFX11-NEXT: s_lshr_b32 s93, s45, 8 ; GFX11-NEXT: v_writelane_b32 v20, s42, 4 +; GFX11-NEXT: s_lshr_b32 s42, s89, 24 +; GFX11-NEXT: s_lshr_b32 s93, s45, 8 ; GFX11-NEXT: s_lshr_b32 s95, s44, 16 ; GFX11-NEXT: s_lshr_b32 vcc_hi, s44, 8 -; GFX11-NEXT: s_lshr_b32 s34, s47, 24 -; GFX11-NEXT: s_lshr_b32 s55, s47, 8 ; GFX11-NEXT: v_writelane_b32 v20, s43, 5 ; GFX11-NEXT: s_lshr_b32 s43, s88, 16 -; GFX11-NEXT: s_lshr_b32 s42, s89, 24 +; GFX11-NEXT: s_lshr_b32 s34, s47, 24 +; GFX11-NEXT: s_lshr_b32 s55, s47, 8 ; GFX11-NEXT: s_lshr_b32 s35, s46, 16 -; GFX11-NEXT: s_lshr_b32 s36, s46, 8 ; GFX11-NEXT: v_writelane_b32 v20, s43, 29 ; GFX11-NEXT: s_lshr_b32 s43, s88, 8 ; GFX11-NEXT: s_lshr_b64 s[88:89], s[88:89], 24 ; GFX11-NEXT: s_lshr_b32 s89, s90, 16 ; GFX11-NEXT: s_lshr_b64 s[90:91], s[90:91], 24 ; GFX11-NEXT: v_writelane_b32 v20, s43, 30 +; GFX11-NEXT: s_lshr_b32 s36, s46, 8 ; GFX11-NEXT: s_lshr_b64 s[46:47], s[46:47], 24 ; GFX11-NEXT: s_lshr_b32 s37, s57, 24 ; GFX11-NEXT: s_lshr_b32 s38, s57, 8 @@ -174258,9 +174258,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: s_and_b32 s16, s16, 0xff ; GFX11-NEXT: s_lshl_b32 s19, s73, 8 ; GFX11-NEXT: s_or_b32 s16, s16, s17 -; GFX11-NEXT: v_readlane_b32 s96, v19, 0 +; GFX11-NEXT: v_readlane_b32 s17, v20, 1 ; GFX11-NEXT: s_lshl_b32 s16, s16, 16 -; GFX11-NEXT: v_readlane_b32 s81, v18, 25 +; GFX11-NEXT: s_lshl_b32 s17, s70, 8 ; GFX11-NEXT: s_or_b32 s3, s3, s16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 @@ -174272,14 +174272,14 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: v_readlane_b32 s16, v20, 0 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_and_b32 s1, s21, 0xff -; GFX11-NEXT: v_readlane_b32 s17, v20, 1 -; GFX11-NEXT: s_or_b32 s1, s1, s2 -; GFX11-NEXT: v_readlane_b32 s2, v20, 18 -; GFX11-NEXT: s_and_b32 s1, s1, 0xffff -; GFX11-NEXT: s_lshl_b32 s17, s70, 8 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-NEXT: s_or_b32 s1, s1, s2 +; GFX11-NEXT: v_readlane_b32 s2, v20, 18 +; GFX11-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-NEXT: v_readlane_b32 s96, v19, 0 +; GFX11-NEXT: v_readlane_b32 s81, v18, 25 ; GFX11-NEXT: v_readlane_b32 s70, v18, 22 ; GFX11-NEXT: s_and_b32 s2, s2, 0xff ; GFX11-NEXT: v_readlane_b32 s69, v18, 21 @@ -174308,22 +174308,21 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: s_or_b32 s17, s17, s18 ; GFX11-NEXT: s_and_b32 s0, s0, 0xffff ; GFX11-NEXT: s_lshl_b32 s16, s17, 16 -; GFX11-NEXT: s_and_b32 s18, s71, 0xff +; GFX11-NEXT: v_readlane_b32 s17, v20, 3 ; GFX11-NEXT: s_or_b32 s3, s3, s16 ; GFX11-NEXT: v_readlane_b32 s16, v20, 2 ; GFX11-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 ; GFX11-NEXT: s_and_b32 s2, s68, 0xff -; GFX11-NEXT: v_readlane_b32 s17, v20, 3 +; GFX11-NEXT: s_lshl_b32 s17, s66, 8 ; GFX11-NEXT: s_lshl_b32 s3, s16, 8 ; GFX11-NEXT: v_readlane_b32 s16, v20, 20 ; GFX11-NEXT: s_or_b32 s1, s2, s3 ; GFX11-NEXT: s_and_b32 s2, s25, 0xff ; GFX11-NEXT: s_lshl_b32 s3, s67, 8 -; GFX11-NEXT: s_lshl_b32 s17, s66, 8 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 ; GFX11-NEXT: s_and_b32 s16, s16, 0xff ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s16, s17 -; GFX11-NEXT: s_lshl_b32 s1, s1, 16 ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 ; GFX11-NEXT: s_or_b32 s0, s0, s1 @@ -174336,9 +174335,10 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: s_or_b32 s3, s16, s17 ; GFX11-NEXT: s_and_b32 s16, s27, 0xff ; GFX11-NEXT: s_lshl_b32 s17, s77, 8 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_and_b32 s18, s71, 0xff ; GFX11-NEXT: s_or_b32 s16, s16, s17 ; GFX11-NEXT: s_or_b32 s17, s18, s19 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 ; GFX11-NEXT: s_and_b32 s16, s16, 0xffff ; GFX11-NEXT: s_lshl_b32 s17, s17, 16 @@ -191731,8 +191731,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_writelane_b32 v62, s5, 3 ; SI-NEXT: s_lshr_b64 s[4:5], s[44:45], 8 ; SI-NEXT: v_writelane_b32 v62, s4, 0 -; SI-NEXT: v_writelane_b32 v62, s5, 1 ; SI-NEXT: v_readfirstlane_b32 s4, v38 +; SI-NEXT: v_writelane_b32 v62, s5, 1 ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: v_readfirstlane_b32 s5, v37 ; SI-NEXT: s_or_b32 s42, s5, s4 @@ -191766,8 +191766,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_writelane_b32 v62, s5, 15 ; SI-NEXT: s_lshr_b64 s[4:5], s[40:41], 8 ; SI-NEXT: v_writelane_b32 v62, s4, 12 -; SI-NEXT: v_writelane_b32 v62, s5, 13 ; SI-NEXT: v_readfirstlane_b32 s4, v35 +; SI-NEXT: v_writelane_b32 v62, s5, 13 ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: v_readfirstlane_b32 s5, v20 ; SI-NEXT: s_or_b32 s28, s5, s4 @@ -191847,8 +191847,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_writelane_b32 v62, s5, 33 ; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 8 ; SI-NEXT: v_writelane_b32 v62, s4, 30 -; SI-NEXT: v_writelane_b32 v62, s5, 31 ; SI-NEXT: v_readfirstlane_b32 s4, v7 +; SI-NEXT: v_writelane_b32 v62, s5, 31 ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: v_readfirstlane_b32 s5, v29 ; SI-NEXT: s_or_b32 s22, s5, s4 @@ -191864,8 +191864,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_writelane_b32 v62, s5, 39 ; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 8 ; SI-NEXT: v_writelane_b32 v62, s4, 36 -; SI-NEXT: v_writelane_b32 v62, s5, 37 ; SI-NEXT: v_readfirstlane_b32 s4, v58 +; SI-NEXT: v_writelane_b32 v62, s5, 37 ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: v_readfirstlane_b32 s5, v13 ; SI-NEXT: s_or_b32 s20, s5, s4 @@ -192030,97 +192030,104 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr80 -; SI-NEXT: v_mov_b32_e32 v51, v42 ; SI-NEXT: v_writelane_b32 v62, s4, 0 ; SI-NEXT: v_writelane_b32 v62, s5, 1 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v41, v21 ; SI-NEXT: v_writelane_b32 v62, s4, 2 ; SI-NEXT: v_writelane_b32 v62, s5, 3 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v21, v24 ; SI-NEXT: v_writelane_b32 v62, s4, 4 ; SI-NEXT: v_writelane_b32 v62, s5, 5 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v40, v34 ; SI-NEXT: v_writelane_b32 v62, s4, 6 ; SI-NEXT: v_writelane_b32 v62, s5, 7 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v34, v61 ; SI-NEXT: v_writelane_b32 v62, s4, 8 ; SI-NEXT: v_writelane_b32 v62, s5, 9 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v13, v12 ; SI-NEXT: v_writelane_b32 v62, s4, 10 ; SI-NEXT: v_writelane_b32 v62, s5, 11 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v12, v48 ; SI-NEXT: v_writelane_b32 v62, s4, 12 ; SI-NEXT: v_writelane_b32 v62, s5, 13 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v57, v30 ; SI-NEXT: v_writelane_b32 v62, s4, 14 ; SI-NEXT: v_writelane_b32 v62, s5, 15 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v58, v11 ; SI-NEXT: v_writelane_b32 v62, s4, 16 ; SI-NEXT: v_writelane_b32 v62, s5, 17 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v56, v47 ; SI-NEXT: v_writelane_b32 v62, s4, 18 ; SI-NEXT: v_writelane_b32 v62, s5, 19 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v46, v33 ; SI-NEXT: v_writelane_b32 v62, s4, 20 ; SI-NEXT: v_writelane_b32 v62, s5, 21 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v30, v32 ; SI-NEXT: v_writelane_b32 v62, s4, 22 ; SI-NEXT: v_writelane_b32 v62, s5, 23 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v32, v31 ; SI-NEXT: v_writelane_b32 v62, s4, 24 ; SI-NEXT: v_writelane_b32 v62, s5, 25 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v31, v10 ; SI-NEXT: v_writelane_b32 v62, s4, 26 ; SI-NEXT: v_writelane_b32 v62, s5, 27 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v54, v9 ; SI-NEXT: v_writelane_b32 v62, s4, 28 ; SI-NEXT: v_writelane_b32 v62, s5, 29 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v55, v4 ; SI-NEXT: v_writelane_b32 v62, s4, 30 ; SI-NEXT: v_writelane_b32 v62, s5, 31 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v22, v2 ; SI-NEXT: v_writelane_b32 v62, s4, 32 ; SI-NEXT: v_writelane_b32 v62, s5, 33 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v2, v1 ; SI-NEXT: v_writelane_b32 v62, s4, 34 ; SI-NEXT: v_writelane_b32 v62, s5, 35 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v17, v43 ; SI-NEXT: v_writelane_b32 v62, s4, 36 ; SI-NEXT: v_writelane_b32 v62, s5, 37 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: s_mov_b64 vcc, -1 ; SI-NEXT: v_writelane_b32 v62, s4, 38 ; SI-NEXT: v_writelane_b32 v62, s5, 39 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v25, v59 ; SI-NEXT: v_writelane_b32 v62, s4, 40 ; SI-NEXT: v_writelane_b32 v62, s5, 41 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v1, v52 ; SI-NEXT: v_writelane_b32 v62, s4, 42 ; SI-NEXT: v_writelane_b32 v62, s5, 43 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v62, s4, 44 +; SI-NEXT: v_writelane_b32 v62, s5, 45 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: v_writelane_b32 v62, s80, 46 +; SI-NEXT: v_writelane_b32 v62, s81, 47 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: v_writelane_b32 v62, s80, 48 +; SI-NEXT: v_mov_b32_e32 v51, v42 +; SI-NEXT: v_mov_b32_e32 v41, v21 +; SI-NEXT: v_mov_b32_e32 v21, v24 +; SI-NEXT: v_mov_b32_e32 v40, v34 +; SI-NEXT: v_mov_b32_e32 v34, v61 +; SI-NEXT: v_mov_b32_e32 v13, v12 +; SI-NEXT: v_mov_b32_e32 v12, v48 +; SI-NEXT: v_mov_b32_e32 v57, v30 +; SI-NEXT: v_mov_b32_e32 v58, v11 +; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_mov_b32_e32 v46, v33 +; SI-NEXT: v_mov_b32_e32 v30, v32 +; SI-NEXT: v_mov_b32_e32 v32, v31 +; SI-NEXT: v_mov_b32_e32 v31, v10 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v54, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v55, v4 +; SI-NEXT: v_mov_b32_e32 v22, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_mov_b32_e32 v17, v43 +; SI-NEXT: s_mov_b64 vcc, -1 +; SI-NEXT: v_writelane_b32 v62, s81, 49 +; SI-NEXT: v_mov_b32_e32 v25, v59 +; SI-NEXT: v_mov_b32_e32 v1, v52 ; SI-NEXT: ; implicit-def: $sgpr44 ; SI-NEXT: ; implicit-def: $sgpr71 ; SI-NEXT: ; implicit-def: $vgpr24 @@ -192190,14 +192197,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: ; implicit-def: $sgpr86 ; SI-NEXT: ; implicit-def: $sgpr84 ; SI-NEXT: ; implicit-def: $sgpr82 -; SI-NEXT: v_writelane_b32 v62, s4, 44 -; SI-NEXT: v_writelane_b32 v62, s5, 45 -; SI-NEXT: v_writelane_b32 v62, s80, 46 -; SI-NEXT: v_writelane_b32 v62, s81, 47 -; SI-NEXT: ; implicit-def: $sgpr80 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v62, s80, 48 -; SI-NEXT: v_writelane_b32 v62, s81, 49 ; SI-NEXT: ; implicit-def: $sgpr80 ; SI-NEXT: .LBB95_3: ; %Flow ; SI-NEXT: v_mov_b32_e32 v14, v17 @@ -192731,11 +192731,9 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_lshr_b32 s71, s45, 8 ; SI-NEXT: .LBB95_5: ; %end ; SI-NEXT: v_readlane_b32 vcc_lo, v62, 0 -; SI-NEXT: v_readlane_b32 vcc_hi, v62, 1 ; SI-NEXT: s_lshl_b32 s47, vcc_lo, 8 -; SI-NEXT: v_readlane_b32 vcc_lo, v62, 2 ; SI-NEXT: s_and_b32 s44, s44, 0xff -; SI-NEXT: v_readlane_b32 vcc_hi, v62, 3 +; SI-NEXT: v_readlane_b32 vcc_lo, v62, 2 ; SI-NEXT: s_or_b32 s44, s44, s47 ; SI-NEXT: s_and_b32 s47, vcc_lo, 0xff ; SI-NEXT: v_readlane_b32 vcc_lo, v62, 4 @@ -192756,10 +192754,9 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v13, v21, v13 ; SI-NEXT: v_or_b32_e32 v13, s44, v13 ; SI-NEXT: v_readlane_b32 s44, v62, 6 -; SI-NEXT: v_readlane_b32 s45, v62, 7 ; SI-NEXT: s_lshl_b32 s44, s44, 8 ; SI-NEXT: s_and_b32 s42, s42, 0xff -; SI-NEXT: v_readlane_b32 vcc_hi, v62, 5 +; SI-NEXT: v_readlane_b32 s45, v62, 7 ; SI-NEXT: s_or_b32 s42, s42, s44 ; SI-NEXT: v_readlane_b32 s44, v62, 8 ; SI-NEXT: v_readlane_b32 s45, v62, 9 @@ -192781,9 +192778,9 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v12, v23, v12 ; SI-NEXT: v_or_b32_e32 v12, s42, v12 ; SI-NEXT: v_readlane_b32 s42, v62, 12 -; SI-NEXT: v_readlane_b32 s43, v62, 13 ; SI-NEXT: s_lshl_b32 s42, s42, 8 ; SI-NEXT: s_and_b32 s40, s40, 0xff +; SI-NEXT: v_readlane_b32 s43, v62, 13 ; SI-NEXT: s_or_b32 s40, s40, s42 ; SI-NEXT: v_readlane_b32 s42, v62, 14 ; SI-NEXT: v_readlane_b32 s43, v62, 15 @@ -192805,9 +192802,9 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v11, v24, v11 ; SI-NEXT: v_or_b32_e32 v11, s40, v11 ; SI-NEXT: v_readlane_b32 s40, v62, 18 -; SI-NEXT: v_readlane_b32 s41, v62, 19 ; SI-NEXT: s_lshl_b32 s40, s40, 8 ; SI-NEXT: s_and_b32 s28, s28, 0xff +; SI-NEXT: v_readlane_b32 s41, v62, 19 ; SI-NEXT: s_or_b32 s28, s28, s40 ; SI-NEXT: v_readlane_b32 s40, v62, 20 ; SI-NEXT: v_readlane_b32 s41, v62, 21 @@ -192829,9 +192826,9 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: v_or_b32_e32 v25, s28, v25 ; SI-NEXT: v_readlane_b32 s28, v62, 24 -; SI-NEXT: v_readlane_b32 s29, v62, 25 ; SI-NEXT: s_lshl_b32 s28, s28, 8 ; SI-NEXT: s_and_b32 s26, s26, 0xff +; SI-NEXT: v_readlane_b32 s29, v62, 25 ; SI-NEXT: s_or_b32 s26, s26, s28 ; SI-NEXT: v_readlane_b32 s28, v62, 26 ; SI-NEXT: v_readlane_b32 s29, v62, 27 @@ -192853,14 +192850,17 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v27, v28, v27 ; SI-NEXT: v_or_b32_e32 v27, s26, v27 ; SI-NEXT: v_readlane_b32 s26, v62, 30 -; SI-NEXT: v_readlane_b32 s27, v62, 31 ; SI-NEXT: s_lshl_b32 s26, s26, 8 ; SI-NEXT: s_and_b32 s24, s24, 0xff +; SI-NEXT: v_readlane_b32 vcc_hi, v62, 1 +; SI-NEXT: v_readlane_b32 s27, v62, 31 ; SI-NEXT: s_or_b32 s24, s24, s26 ; SI-NEXT: v_readlane_b32 s26, v62, 32 +; SI-NEXT: v_readlane_b32 vcc_hi, v62, 3 ; SI-NEXT: v_readlane_b32 s27, v62, 33 ; SI-NEXT: s_and_b32 s26, s26, 0xff ; SI-NEXT: v_readlane_b32 s28, v62, 34 +; SI-NEXT: v_readlane_b32 vcc_hi, v62, 5 ; SI-NEXT: s_lshl_b32 s27, s28, 24 ; SI-NEXT: s_lshl_b32 s26, s26, 16 ; SI-NEXT: v_readlane_b32 vcc_hi, v62, 11 @@ -192894,11 +192894,11 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v11, vcc, 24, v0 -; SI-NEXT: v_readlane_b32 s25, v62, 37 ; SI-NEXT: s_lshl_b32 s24, s24, 8 ; SI-NEXT: s_and_b32 s22, s22, 0xff ; SI-NEXT: buffer_store_dword v24, v11, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v11, vcc, 28, v0 +; SI-NEXT: v_readlane_b32 s25, v62, 37 ; SI-NEXT: s_or_b32 s22, s22, s24 ; SI-NEXT: v_readlane_b32 s24, v62, 38 ; SI-NEXT: buffer_store_dword v25, v11, s[0:3], 0 offen @@ -196410,7 +196410,6 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: ; implicit-def: $sgpr74 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: v_writelane_b32 v78, s42, 0 -; GFX11-NEXT: v_writelane_b32 v78, s43, 1 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 @@ -196421,8 +196420,10 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: v_writelane_b32 v78, s43, 1 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr43 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 @@ -196485,7 +196486,6 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr43 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: s_branch .LBB95_2 ; GFX11-NEXT: .LBB95_4: @@ -196568,9 +196568,10 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: v_readlane_b32 s0, v78, 18 ; GFX11-NEXT: v_dual_mov_b32 v67, s58 :: v_dual_mov_b32 v26, s88 ; GFX11-NEXT: v_dual_mov_b32 v81, s44 :: v_dual_mov_b32 v30, s90 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_mov_b32_e32 v42, s0 ; GFX11-NEXT: v_readlane_b32 s0, v78, 19 +; GFX11-NEXT: v_readlane_b32 s1, v78, 1 ; GFX11-NEXT: v_dual_mov_b32 v17, s74 :: v_dual_mov_b32 v34, s92 ; GFX11-NEXT: v_dual_mov_b32 v65, s94 :: v_dual_mov_b32 v68, s30 ; GFX11-NEXT: v_mov_b32_e32 v41, s0 @@ -196630,10 +196631,9 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: v_readlane_b32 s0, v77, 7 ; GFX11-NEXT: v_mov_b32_e32 v149, s0 ; GFX11-NEXT: v_readlane_b32 s0, v77, 8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v148, s0 ; GFX11-NEXT: v_readlane_b32 s0, v78, 0 -; GFX11-NEXT: v_readlane_b32 s1, v78, 1 ; GFX11-NEXT: v_mov_b32_e32 v82, s0 ; GFX11-NEXT: .LBB95_5: ; %end ; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v74 @@ -214805,7 +214805,6 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_writelane_b32 v41, s17, 23 ; SI-NEXT: s_lshr_b64 s[16:17], s[60:61], 16 ; SI-NEXT: v_writelane_b32 v41, s16, 20 -; SI-NEXT: v_writelane_b32 v41, s17, 21 ; SI-NEXT: s_lshr_b32 s16, s61, 24 ; SI-NEXT: v_writelane_b32 v43, s16, 18 ; SI-NEXT: s_lshr_b32 s16, s61, 16 @@ -214907,6 +214906,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_lshr_b32 s16, s5, 16 ; SI-NEXT: v_writelane_b32 v42, s16, 0 ; SI-NEXT: s_lshr_b32 s16, s5, 8 +; SI-NEXT: v_writelane_b32 v41, s17, 21 ; SI-NEXT: v_writelane_b32 v42, s16, 1 ; SI-NEXT: s_lshr_b64 s[16:17], s[56:57], 24 ; SI-NEXT: v_writelane_b32 v41, s16, 28 @@ -215049,7 +215049,6 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_readlane_b32 s17, v41, 25 ; SI-NEXT: s_lshl_b32 s17, s16, 8 ; SI-NEXT: s_and_b32 s18, s56, 0xff -; SI-NEXT: v_readlane_b32 s21, v41, 23 ; SI-NEXT: s_or_b32 s17, s18, s17 ; SI-NEXT: v_readlane_b32 s18, v41, 26 ; SI-NEXT: v_readlane_b32 s19, v41, 27 @@ -215191,7 +215190,6 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v14, s17 ; SI-NEXT: s_lshl_b32 s17, s90, 8 ; SI-NEXT: s_and_b32 s18, s46, 0xff -; SI-NEXT: v_readlane_b32 s21, v41, 29 ; SI-NEXT: s_or_b32 s17, s18, s17 ; SI-NEXT: v_readlane_b32 s18, v41, 30 ; SI-NEXT: v_readlane_b32 s19, v41, 31 @@ -215210,82 +215208,78 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_and_b32 s18, s16, 0xff ; SI-NEXT: v_readlane_b32 s16, v43, 39 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_lshl_b32 s19, s16, 24 -; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: s_and_b32 s17, s17, 0xffff ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_lshl_b32 s19, s16, 24 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: v_mov_b32_e32 v16, s17 -; SI-NEXT: v_readlane_b32 s16, v41, 34 +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: s_and_b32 s17, s17, 0xffff ; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: v_readlane_b32 s17, v41, 35 -; SI-NEXT: v_readlane_b32 s18, v41, 36 +; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; SI-NEXT: s_lshl_b32 s16, s16, 8 -; SI-NEXT: s_and_b32 s17, s42, 0xff -; SI-NEXT: v_readlane_b32 s19, v41, 37 +; SI-NEXT: v_mov_b32_e32 v16, s17 +; SI-NEXT: v_readlane_b32 s16, v41, 34 +; SI-NEXT: v_readlane_b32 s17, v41, 35 ; SI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 +; SI-NEXT: s_lshl_b32 s16, s16, 8 +; SI-NEXT: s_and_b32 s17, s42, 0xff +; SI-NEXT: v_readlane_b32 s18, v41, 36 +; SI-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_and_b32 s17, s18, 0xff ; SI-NEXT: v_readlane_b32 s18, v41, 38 -; SI-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 -; SI-NEXT: s_lshl_b32 s18, s18, 24 -; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; SI-NEXT: v_readlane_b32 s17, v43, 44 +; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 +; SI-NEXT: v_readlane_b32 s17, v43, 44 +; SI-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 ; SI-NEXT: v_mov_b32_e32 v2, s16 ; SI-NEXT: s_and_b32 s16, s43, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 -; SI-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_readlane_b32 s17, v43, 43 ; SI-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 -; SI-NEXT: s_and_b32 s17, s17, 0xff -; SI-NEXT: v_readlane_b32 s18, v43, 42 -; SI-NEXT: v_readlane_b32 s19, v41, 39 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s17, v43, 43 ; SI-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: s_and_b32 s17, s17, 0xff +; SI-NEXT: v_readlane_b32 s18, v43, 42 ; SI-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: v_readlane_b32 s18, v41, 40 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s18, 24 ; SI-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 ; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: v_readlane_b32 s19, v41, 41 +; SI-NEXT: s_or_b32 s17, s18, s17 ; SI-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 ; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: s_lshl_b32 s17, s18, 8 -; SI-NEXT: v_readlane_b32 s18, v41, 42 +; SI-NEXT: v_readlane_b32 s18, v41, 40 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s16 ; SI-NEXT: s_and_b32 s16, s40, 0xff -; SI-NEXT: v_readlane_b32 s19, v41, 43 +; SI-NEXT: s_lshl_b32 s17, s18, 8 +; SI-NEXT: v_readlane_b32 s18, v41, 42 ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: s_and_b32 s17, s18, 0xff ; SI-NEXT: v_readlane_b32 s18, v41, 44 @@ -215316,9 +215310,8 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s16 ; SI-NEXT: v_readlane_b32 s16, v41, 46 ; SI-NEXT: s_and_b32 s14, s14, 0xff -; SI-NEXT: v_readlane_b32 s17, v41, 47 ; SI-NEXT: s_lshl_b32 s16, s16, 8 -; SI-NEXT: v_readlane_b32 s19, v41, 45 +; SI-NEXT: v_readlane_b32 s17, v41, 47 ; SI-NEXT: s_or_b32 s14, s14, s16 ; SI-NEXT: v_readlane_b32 s16, v41, 48 ; SI-NEXT: v_readlane_b32 s17, v41, 49 @@ -215351,8 +215344,8 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s14 ; SI-NEXT: v_readlane_b32 s14, v41, 52 ; SI-NEXT: s_and_b32 s12, s12, 0xff -; SI-NEXT: v_readlane_b32 s15, v41, 53 ; SI-NEXT: s_lshl_b32 s14, s14, 8 +; SI-NEXT: v_readlane_b32 s15, v41, 53 ; SI-NEXT: s_or_b32 s12, s12, s14 ; SI-NEXT: v_readlane_b32 s14, v41, 54 ; SI-NEXT: v_readlane_b32 s15, v41, 55 @@ -215385,8 +215378,8 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s12 ; SI-NEXT: v_readlane_b32 s12, v41, 58 ; SI-NEXT: s_and_b32 s10, s10, 0xff -; SI-NEXT: v_readlane_b32 s13, v41, 59 ; SI-NEXT: s_lshl_b32 s12, s12, 8 +; SI-NEXT: v_readlane_b32 s13, v41, 59 ; SI-NEXT: s_or_b32 s10, s10, s12 ; SI-NEXT: v_readlane_b32 s12, v41, 60 ; SI-NEXT: v_readlane_b32 s13, v41, 61 @@ -215419,8 +215412,8 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: v_readlane_b32 s10, v43, 0 ; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: v_readlane_b32 s11, v43, 1 ; SI-NEXT: s_lshl_b32 s10, s10, 8 +; SI-NEXT: v_readlane_b32 s11, v43, 1 ; SI-NEXT: s_or_b32 s8, s8, s10 ; SI-NEXT: v_readlane_b32 s10, v43, 2 ; SI-NEXT: v_readlane_b32 s11, v43, 3 @@ -215453,8 +215446,8 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_readlane_b32 s8, v43, 6 ; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 7 ; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: v_readlane_b32 s9, v43, 7 ; SI-NEXT: s_or_b32 s6, s6, s8 ; SI-NEXT: v_readlane_b32 s8, v43, 8 ; SI-NEXT: v_readlane_b32 s9, v43, 9 @@ -215487,8 +215480,8 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_readlane_b32 s6, v43, 12 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: v_readlane_b32 s7, v43, 13 ; SI-NEXT: s_lshl_b32 s6, s6, 8 +; SI-NEXT: v_readlane_b32 s7, v43, 13 ; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: v_readlane_b32 s6, v43, 14 ; SI-NEXT: v_readlane_b32 s7, v43, 15 @@ -215508,14 +215501,21 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_readlane_b32 s5, v42, 0 +; SI-NEXT: v_readlane_b32 s19, v41, 37 ; SI-NEXT: s_and_b32 s5, s5, 0xff ; SI-NEXT: v_readlane_b32 s6, v43, 63 +; SI-NEXT: v_readlane_b32 s19, v41, 39 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s6, s6, 24 +; SI-NEXT: v_readlane_b32 s19, v41, 41 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_readlane_b32 s21, v41, 23 +; SI-NEXT: v_readlane_b32 s19, v41, 43 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x78, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s21, v41, 29 +; SI-NEXT: v_readlane_b32 s19, v41, 45 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: v_mov_b32_e32 v1, s4 @@ -215573,47 +215573,14 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB99_4: ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr16 ; SI-NEXT: v_writelane_b32 v41, s4, 20 ; SI-NEXT: v_writelane_b32 v41, s5, 21 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr92 -; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $sgpr82 -; SI-NEXT: ; implicit-def: $sgpr80 -; SI-NEXT: ; implicit-def: $sgpr70 -; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr68 -; SI-NEXT: ; implicit-def: $sgpr66 -; SI-NEXT: ; implicit-def: $sgpr64 -; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; implicit-def: $sgpr52 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr38 -; SI-NEXT: ; implicit-def: $sgpr36 -; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr34 -; SI-NEXT: ; implicit-def: $sgpr30 -; SI-NEXT: ; implicit-def: $sgpr94 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr90 -; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: v_writelane_b32 v41, s4, 22 -; SI-NEXT: v_writelane_b32 v41, s5, 23 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v41, s5, 23 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; kill: killed $sgpr4 @@ -215625,7 +215592,6 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_writelane_b32 v41, s5, 27 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: v_writelane_b32 v41, s4, 28 -; SI-NEXT: v_writelane_b32 v41, s5, 29 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 @@ -215713,6 +215679,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v41, s5, 29 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; kill: killed $sgpr4 @@ -215766,7 +215733,6 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_writelane_b32 v41, s5, 61 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: v_writelane_b32 v41, s4, 62 -; SI-NEXT: v_writelane_b32 v41, s5, 63 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v43, s4, 0 @@ -215786,14 +215752,48 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: v_writelane_b32 v43, s4, 10 ; SI-NEXT: v_writelane_b32 v43, s5, 11 +; SI-NEXT: ; implicit-def: $sgpr16 ; SI-NEXT: v_writelane_b32 v43, s16, 12 ; SI-NEXT: v_writelane_b32 v43, s17, 13 ; SI-NEXT: ; implicit-def: $sgpr16 -; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: v_writelane_b32 v43, s16, 14 ; SI-NEXT: v_writelane_b32 v43, s17, 15 ; SI-NEXT: ; implicit-def: $sgpr16 ; SI-NEXT: v_writelane_b32 v43, s16, 16 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr82 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: v_writelane_b32 v41, s5, 63 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: v_writelane_b32 v43, s17, 17 ; SI-NEXT: s_branch .LBB99_2 ; @@ -218766,7 +218766,6 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: ; implicit-def: $sgpr74 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: v_writelane_b32 v78, s42, 0 -; GFX11-NEXT: v_writelane_b32 v78, s43, 1 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 @@ -218777,8 +218776,10 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: v_writelane_b32 v78, s43, 1 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr43 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 @@ -218841,7 +218842,6 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr43 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: s_branch .LBB99_2 ; GFX11-NEXT: .LBB99_4: @@ -218924,9 +218924,10 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: v_readlane_b32 s0, v78, 18 ; GFX11-NEXT: v_dual_mov_b32 v67, s58 :: v_dual_mov_b32 v26, s88 ; GFX11-NEXT: v_dual_mov_b32 v81, s44 :: v_dual_mov_b32 v30, s90 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_mov_b32_e32 v42, s0 ; GFX11-NEXT: v_readlane_b32 s0, v78, 19 +; GFX11-NEXT: v_readlane_b32 s1, v78, 1 ; GFX11-NEXT: v_dual_mov_b32 v17, s74 :: v_dual_mov_b32 v34, s92 ; GFX11-NEXT: v_dual_mov_b32 v65, s94 :: v_dual_mov_b32 v68, s30 ; GFX11-NEXT: v_mov_b32_e32 v41, s0 @@ -218986,10 +218987,9 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: v_readlane_b32 s0, v77, 7 ; GFX11-NEXT: v_mov_b32_e32 v149, s0 ; GFX11-NEXT: v_readlane_b32 s0, v77, 8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v148, s0 ; GFX11-NEXT: v_readlane_b32 s0, v78, 0 -; GFX11-NEXT: v_readlane_b32 s1, v78, 1 ; GFX11-NEXT: v_mov_b32_e32 v82, s0 ; GFX11-NEXT: .LBB99_5: ; %end ; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v74 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll index a0c596ff9d5de..a7f89579b5ce0 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll @@ -66747,11 +66747,9 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; SI-NEXT: v_writelane_b32 v21, s17, 13 ; SI-NEXT: .LBB97_3: ; %end ; SI-NEXT: v_readlane_b32 s18, v21, 0 -; SI-NEXT: v_readlane_b32 s19, v21, 1 +; SI-NEXT: s_and_b32 s16, s40, 0xff ; SI-NEXT: s_lshl_b32 s17, s18, 8 ; SI-NEXT: v_readlane_b32 s18, v21, 2 -; SI-NEXT: s_and_b32 s16, s40, 0xff -; SI-NEXT: v_readlane_b32 s19, v21, 3 ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: s_and_b32 s17, s18, 0xff ; SI-NEXT: v_readlane_b32 s18, v21, 4 @@ -66773,9 +66771,8 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; SI-NEXT: v_mov_b32_e32 v2, s16 ; SI-NEXT: v_readlane_b32 s16, v21, 6 ; SI-NEXT: s_and_b32 s14, s14, 0xff -; SI-NEXT: v_readlane_b32 s17, v21, 7 ; SI-NEXT: s_lshl_b32 s16, s16, 8 -; SI-NEXT: v_readlane_b32 s19, v21, 5 +; SI-NEXT: v_readlane_b32 s17, v21, 7 ; SI-NEXT: s_or_b32 s14, s14, s16 ; SI-NEXT: v_readlane_b32 s16, v21, 8 ; SI-NEXT: v_readlane_b32 s17, v21, 9 @@ -66807,8 +66804,8 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; SI-NEXT: v_mov_b32_e32 v2, s14 ; SI-NEXT: v_readlane_b32 s14, v21, 12 ; SI-NEXT: s_and_b32 s10, s10, 0xff -; SI-NEXT: v_readlane_b32 s15, v21, 13 ; SI-NEXT: s_lshl_b32 s14, s14, 8 +; SI-NEXT: v_readlane_b32 s15, v21, 13 ; SI-NEXT: s_or_b32 s10, s10, s14 ; SI-NEXT: v_readlane_b32 s14, v21, 14 ; SI-NEXT: v_readlane_b32 s15, v21, 15 @@ -66959,10 +66956,13 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; SI-NEXT: s_and_b32 s5, s89, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s6, s91, 24 +; SI-NEXT: v_readlane_b32 s19, v21, 1 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_readlane_b32 s19, v21, 3 ; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s19, v21, 5 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 ; SI-NEXT: v_mov_b32_e32 v1, s4 @@ -67017,6 +67017,28 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; SI-NEXT: v_writelane_b32 v21, s4, 0 ; SI-NEXT: v_writelane_b32 v21, s5, 1 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v21, s4, 2 +; SI-NEXT: v_writelane_b32 v21, s5, 3 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v21, s4, 4 +; SI-NEXT: v_writelane_b32 v21, s5, 5 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v21, s4, 6 +; SI-NEXT: v_writelane_b32 v21, s5, 7 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v21, s4, 8 +; SI-NEXT: v_writelane_b32 v21, s5, 9 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v21, s4, 10 +; SI-NEXT: v_writelane_b32 v21, s5, 11 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v21, s4, 12 +; SI-NEXT: v_writelane_b32 v21, s5, 13 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v21, s4, 14 +; SI-NEXT: v_writelane_b32 v21, s5, 15 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v21, s4, 16 ; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; implicit-def: $sgpr74 @@ -67044,6 +67066,7 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; SI-NEXT: ; implicit-def: $sgpr79 ; SI-NEXT: ; implicit-def: $sgpr89 ; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: v_writelane_b32 v21, s5, 17 ; SI-NEXT: ; implicit-def: $sgpr42 ; SI-NEXT: ; implicit-def: $sgpr66 ; SI-NEXT: ; implicit-def: $sgpr64 @@ -67060,33 +67083,10 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; SI-NEXT: ; implicit-def: $sgpr30 ; SI-NEXT: ; implicit-def: $sgpr94 ; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr90 ; SI-NEXT: ; implicit-def: $sgpr88 ; SI-NEXT: ; implicit-def: $sgpr78 -; SI-NEXT: v_writelane_b32 v21, s4, 2 -; SI-NEXT: v_writelane_b32 v21, s5, 3 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v21, s4, 4 -; SI-NEXT: v_writelane_b32 v21, s5, 5 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v21, s4, 6 -; SI-NEXT: v_writelane_b32 v21, s5, 7 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v21, s4, 8 -; SI-NEXT: v_writelane_b32 v21, s5, 9 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v21, s4, 10 -; SI-NEXT: v_writelane_b32 v21, s5, 11 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v21, s4, 12 -; SI-NEXT: v_writelane_b32 v21, s5, 13 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v21, s4, 14 -; SI-NEXT: v_writelane_b32 v21, s5, 15 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v21, s4, 16 -; SI-NEXT: v_writelane_b32 v21, s5, 17 -; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: s_branch .LBB97_2 ; ; VI-LABEL: bitcast_v32i16_to_v64i8_scalar: @@ -88410,8 +88410,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: s_lshr_b64 s[4:5], s[74:75], 24 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v41, s4, 0 -; SI-NEXT: v_writelane_b32 v41, s5, 1 ; SI-NEXT: v_readfirstlane_b32 s4, v6 +; SI-NEXT: v_writelane_b32 v41, s5, 1 ; SI-NEXT: s_lshr_b32 s5, s4, 16 ; SI-NEXT: v_readfirstlane_b32 s4, v7 ; SI-NEXT: s_lshr_b64 s[60:61], s[4:5], 16 diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll index 37cbd2d926413..34abba10f6c61 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll @@ -150,8 +150,8 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) { ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, vcc_lo +; GCN-NEXT: v_readlane_b32 s0, v0, 0 ; GCN-NEXT: v_readlane_b32 s1, v0, 1 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir b/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir index 5f36d5403ebcf..744871d8c84ff 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir @@ -65,12 +65,12 @@ body: | ; CHECK: S_NOP 0, implicit-def $exec ; CHECK-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; CHECK-NEXT: $vgpr0 = IMPLICIT_DEF - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1 - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, $vgpr0, implicit $sgpr0_sgpr1 - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1 + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, $vgpr0 + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1 ; CHECK-NEXT: S_NOP 0, implicit-def dead renamable $sgpr2_sgpr3, implicit-def dead renamable $sgpr0_sgpr1, implicit killed renamable $sgpr0_sgpr1 - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1 + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1 ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec @@ -141,12 +141,12 @@ body: | ; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_exec ; CHECK: S_NOP 0, implicit-def renamable $sgpr0_sgpr1, implicit-def dead renamable $sgpr2_sgpr3, implicit-def $exec ; CHECK-NEXT: $vgpr0 = IMPLICIT_DEF - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1 - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, $vgpr0, implicit $sgpr0_sgpr1 - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1 + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, $vgpr0 + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1 ; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr0_sgpr1, implicit-def dead renamable $sgpr2_sgpr3, implicit-def dead renamable $sgpr0_sgpr1 - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1 + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1 ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll index 76f204dd0c16a..420f003d4f417 100644 --- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll +++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll @@ -41,10 +41,10 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: s_load_dwordx16 s[8:23], s[68:69], 0x130 ; CHECK-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane ; CHECK-NEXT: v_writelane_b32 v6, s70, 20 +; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: v_writelane_b32 v6, s71, 21 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v1, s4 -; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: v_writelane_b32 v7, s8, 0 ; CHECK-NEXT: v_writelane_b32 v7, s9, 1 ; CHECK-NEXT: v_writelane_b32 v7, s10, 2 @@ -76,15 +76,14 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: v_writelane_b32 v7, s64, 28 ; CHECK-NEXT: v_writelane_b32 v7, s65, 29 ; CHECK-NEXT: v_writelane_b32 v7, s66, 30 +; CHECK-NEXT: v_writelane_b32 v7, s67, 31 ; CHECK-NEXT: s_load_dwordx16 s[8:23], s[68:69], 0x1f0 ; CHECK-NEXT: s_load_dwordx16 s[36:51], s[68:69], 0x2f0 ; CHECK-NEXT: s_mov_b32 s69, s68 ; CHECK-NEXT: s_mov_b32 s70, s68 ; CHECK-NEXT: s_mov_b32 s71, s68 -; CHECK-NEXT: v_writelane_b32 v7, s67, 31 -; CHECK-NEXT: image_sample_lz v1, v[1:2], s[60:67], s[68:71] dmask:0x1 -; CHECK-NEXT: v_readlane_b32 s52, v7, 0 ; CHECK-NEXT: v_mov_b32_e32 v3, v2 +; CHECK-NEXT: v_readlane_b32 s52, v7, 0 ; CHECK-NEXT: v_readlane_b32 s53, v7, 1 ; CHECK-NEXT: v_readlane_b32 s54, v7, 2 ; CHECK-NEXT: v_readlane_b32 s55, v7, 3 @@ -92,12 +91,13 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: v_readlane_b32 s57, v7, 5 ; CHECK-NEXT: v_readlane_b32 s58, v7, 6 ; CHECK-NEXT: v_readlane_b32 s59, v7, 7 +; CHECK-NEXT: image_sample_lz v1, v[1:2], s[60:67], s[68:71] dmask:0x1 ; CHECK-NEXT: v_and_b32_e32 v5, 1, v0 ; CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v5 ; CHECK-NEXT: v_readlane_b32 s60, v7, 8 ; CHECK-NEXT: v_readlane_b32 s61, v7, 9 -; CHECK-NEXT: v_readlane_b32 s62, v7, 10 ; CHECK-NEXT: image_sample_lz v4, v[2:3], s[52:59], s[68:71] dmask:0x1 +; CHECK-NEXT: v_readlane_b32 s62, v7, 10 ; CHECK-NEXT: v_readlane_b32 s63, v7, 11 ; CHECK-NEXT: v_readlane_b32 s64, v7, 12 ; CHECK-NEXT: v_readlane_b32 s65, v7, 13 @@ -109,7 +109,6 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; CHECK-NEXT: s_cbranch_execz .LBB0_3 ; CHECK-NEXT: ; %bb.1: ; %bb48 -; CHECK-NEXT: v_readlane_b32 s52, v7, 16 ; CHECK-NEXT: v_readlane_b32 s60, v7, 24 ; CHECK-NEXT: v_readlane_b32 s61, v7, 25 ; CHECK-NEXT: v_readlane_b32 s62, v7, 26 @@ -120,10 +119,11 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: v_readlane_b32 s67, v7, 31 ; CHECK-NEXT: v_mov_b32_e32 v1, v2 ; CHECK-NEXT: s_and_b64 vcc, exec, -1 +; CHECK-NEXT: v_readlane_b32 s52, v7, 16 ; CHECK-NEXT: v_readlane_b32 s53, v7, 17 ; CHECK-NEXT: v_readlane_b32 s54, v7, 18 -; CHECK-NEXT: v_readlane_b32 s55, v7, 19 ; CHECK-NEXT: image_sample_lz v3, v[2:3], s[60:67], s[68:71] dmask:0x1 +; CHECK-NEXT: v_readlane_b32 s55, v7, 19 ; CHECK-NEXT: v_readlane_b32 s56, v7, 20 ; CHECK-NEXT: v_readlane_b32 s57, v7, 21 ; CHECK-NEXT: v_readlane_b32 s58, v7, 22 @@ -152,10 +152,18 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: s_mov_b32 s16, 0 ; CHECK-NEXT: s_mov_b32 s17, s16 ; CHECK-NEXT: v_mov_b32_e32 v0, s16 -; CHECK-NEXT: v_readlane_b32 s44, v7, 16 +; CHECK-NEXT: v_readlane_b32 s52, v7, 24 +; CHECK-NEXT: v_readlane_b32 s53, v7, 25 +; CHECK-NEXT: v_readlane_b32 s54, v7, 26 +; CHECK-NEXT: v_readlane_b32 s55, v7, 27 +; CHECK-NEXT: v_readlane_b32 s56, v7, 28 +; CHECK-NEXT: v_readlane_b32 s57, v7, 29 +; CHECK-NEXT: v_readlane_b32 s58, v7, 30 +; CHECK-NEXT: v_readlane_b32 s59, v7, 31 ; CHECK-NEXT: v_mov_b32_e32 v1, s17 ; CHECK-NEXT: s_mov_b32 s18, s16 ; CHECK-NEXT: s_mov_b32 s19, s16 +; CHECK-NEXT: v_readlane_b32 s44, v7, 16 ; CHECK-NEXT: v_readlane_b32 s45, v7, 17 ; CHECK-NEXT: v_readlane_b32 s46, v7, 18 ; CHECK-NEXT: v_readlane_b32 s47, v7, 19 @@ -163,16 +171,6 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: v_readlane_b32 s49, v7, 21 ; CHECK-NEXT: v_readlane_b32 s50, v7, 22 ; CHECK-NEXT: v_readlane_b32 s51, v7, 23 -; CHECK-NEXT: v_readlane_b32 s52, v7, 24 -; CHECK-NEXT: v_readlane_b32 s53, v7, 25 -; CHECK-NEXT: v_readlane_b32 s54, v7, 26 -; CHECK-NEXT: v_readlane_b32 s55, v7, 27 -; CHECK-NEXT: v_readlane_b32 s56, v7, 28 -; CHECK-NEXT: v_readlane_b32 s57, v7, 29 -; CHECK-NEXT: v_readlane_b32 s58, v7, 30 -; CHECK-NEXT: v_readlane_b32 s59, v7, 31 -; CHECK-NEXT: image_sample_lz v2, v[0:1], s[44:51], s[16:19] dmask:0x1 -; CHECK-NEXT: v_readlane_b32 s44, v7, 0 ; CHECK-NEXT: v_readlane_b32 s52, v7, 8 ; CHECK-NEXT: v_readlane_b32 s53, v7, 9 ; CHECK-NEXT: v_readlane_b32 s54, v7, 10 @@ -181,12 +179,14 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: v_readlane_b32 s57, v7, 13 ; CHECK-NEXT: v_readlane_b32 s58, v7, 14 ; CHECK-NEXT: v_readlane_b32 s59, v7, 15 +; CHECK-NEXT: image_sample_lz v2, v[0:1], s[44:51], s[16:19] dmask:0x1 ; CHECK-NEXT: v_mov_b32_e32 v3, 0 ; CHECK-NEXT: v_mov_b32_e32 v4, v3 +; CHECK-NEXT: v_readlane_b32 s44, v7, 0 ; CHECK-NEXT: v_readlane_b32 s45, v7, 1 +; CHECK-NEXT: image_sample_lz v0, v[0:1], s[52:59], s[24:27] dmask:0x1 ; CHECK-NEXT: v_readlane_b32 s46, v7, 2 ; CHECK-NEXT: v_readlane_b32 s47, v7, 3 -; CHECK-NEXT: image_sample_lz v0, v[0:1], s[52:59], s[24:27] dmask:0x1 ; CHECK-NEXT: v_readlane_b32 s48, v7, 4 ; CHECK-NEXT: v_readlane_b32 s49, v7, 5 ; CHECK-NEXT: v_readlane_b32 s50, v7, 6 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll index e1b4cad370f96..4a89b2fcc017c 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -1826,10 +1826,10 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: s_or_b32 s0, s0, s5 ; GCN-NEXT: s_and_b32 s0, s0, 0xffff ; GCN-NEXT: s_or_b32 s0, s0, s4 -; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_readlane_b32 s0, v6, 0 ; GCN-NEXT: v_readlane_b32 s1, v6, 1 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_readlane_b32 s0, v6, 0 ; GCN-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s3 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll index 83c240c17ff1c..9fdc72f054f90 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll @@ -10279,11 +10279,11 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: v_writelane_b32 v62, s3, 5 ; GFX8-NEXT: v_readlane_b32 s2, v62, 2 ; GFX8-NEXT: v_readlane_b32 s3, v62, 3 +; GFX8-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 ; GFX8-NEXT: v_mov_b32_e32 v35, s49 ; GFX8-NEXT: s_bfe_i64 s[48:49], s[4:5], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x10000 ; GFX8-NEXT: v_readlane_b32 s2, v62, 0 -; GFX8-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 ; GFX8-NEXT: v_readlane_b32 s3, v62, 1 ; GFX8-NEXT: v_mov_b32_e32 v5, s75 ; GFX8-NEXT: v_mov_b32_e32 v13, s73 @@ -10577,8 +10577,8 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NEXT: v_readlane_b32 s2, v62, 4 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_readlane_b32 s2, v62, 4 ; GFX8-NEXT: v_readlane_b32 s3, v62, 5 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v0, s30 diff --git a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll index f4a9e7e8f2759..148ddb4237e3d 100644 --- a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll @@ -139,13 +139,13 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX906-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX906-NEXT: s_mov_b64 exec, s[34:35] -; GFX906-NEXT: v_readlane_b32 s16, v39, 22 ; GFX906-NEXT: s_mov_b32 s12, s24 ; GFX906-NEXT: s_mov_b32 s13, s23 ; GFX906-NEXT: s_mov_b32 s14, s22 ; GFX906-NEXT: v_mov_b32_e32 v31, v32 ; GFX906-NEXT: s_mov_b32 s15, s21 ; GFX906-NEXT: s_mov_b64 s[10:11], s[26:27] +; GFX906-NEXT: v_readlane_b32 s16, v39, 22 ; GFX906-NEXT: v_readlane_b32 s17, v39, 23 ; GFX906-NEXT: v_mov_b32_e32 v40, v32 ; GFX906-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -232,20 +232,20 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX906-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX906-NEXT: s_mov_b64 exec, s[34:35] -; GFX906-NEXT: v_readlane_b32 s4, v39, 10 -; GFX906-NEXT: v_readlane_b32 s6, v39, 8 -; GFX906-NEXT: v_readlane_b32 s8, v39, 6 -; GFX906-NEXT: v_readlane_b32 s10, v39, 4 -; GFX906-NEXT: v_readlane_b32 s16, v39, 22 ; GFX906-NEXT: v_readlane_b32 s12, v39, 3 ; GFX906-NEXT: v_mov_b32_e32 v31, v40 ; GFX906-NEXT: v_readlane_b32 s13, v39, 2 ; GFX906-NEXT: v_readlane_b32 s14, v39, 1 ; GFX906-NEXT: v_readlane_b32 s15, v39, 0 +; GFX906-NEXT: v_readlane_b32 s4, v39, 10 ; GFX906-NEXT: v_readlane_b32 s5, v39, 11 +; GFX906-NEXT: v_readlane_b32 s6, v39, 8 ; GFX906-NEXT: v_readlane_b32 s7, v39, 9 +; GFX906-NEXT: v_readlane_b32 s8, v39, 6 ; GFX906-NEXT: v_readlane_b32 s9, v39, 7 +; GFX906-NEXT: v_readlane_b32 s10, v39, 4 ; GFX906-NEXT: v_readlane_b32 s11, v39, 5 +; GFX906-NEXT: v_readlane_b32 s16, v39, 22 ; GFX906-NEXT: v_readlane_b32 s17, v39, 23 ; GFX906-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1 @@ -253,19 +253,19 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX906-NEXT: s_mov_b64 exec, s[34:35] ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_readlane_b32 s4, v39, 10 -; GFX906-NEXT: v_readlane_b32 s6, v39, 8 -; GFX906-NEXT: v_readlane_b32 s8, v39, 6 -; GFX906-NEXT: v_readlane_b32 s10, v39, 4 -; GFX906-NEXT: v_readlane_b32 s16, v39, 22 ; GFX906-NEXT: v_readlane_b32 s5, v39, 11 +; GFX906-NEXT: v_readlane_b32 s6, v39, 8 ; GFX906-NEXT: v_readlane_b32 s7, v39, 9 +; GFX906-NEXT: v_readlane_b32 s8, v39, 6 ; GFX906-NEXT: v_readlane_b32 s9, v39, 7 +; GFX906-NEXT: v_readlane_b32 s10, v39, 4 ; GFX906-NEXT: v_readlane_b32 s11, v39, 5 ; GFX906-NEXT: v_readlane_b32 s12, v39, 3 ; GFX906-NEXT: v_readlane_b32 s13, v39, 2 ; GFX906-NEXT: v_readlane_b32 s14, v39, 1 ; GFX906-NEXT: v_readlane_b32 s15, v39, 0 ; GFX906-NEXT: v_mov_b32_e32 v31, v40 +; GFX906-NEXT: v_readlane_b32 s16, v39, 22 ; GFX906-NEXT: v_readlane_b32 s17, v39, 23 ; GFX906-NEXT: v_readlane_b32 s21, v39, 12 ; GFX906-NEXT: ;;#ASMSTART @@ -518,13 +518,13 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX908-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX908-NEXT: s_mov_b64 exec, s[34:35] -; GFX908-NEXT: v_readlane_b32 s16, v39, 22 ; GFX908-NEXT: s_mov_b32 s12, s24 ; GFX908-NEXT: s_mov_b32 s13, s23 ; GFX908-NEXT: s_mov_b32 s14, s22 ; GFX908-NEXT: v_mov_b32_e32 v31, v32 ; GFX908-NEXT: s_mov_b32 s15, s21 ; GFX908-NEXT: s_mov_b64 s[10:11], s[26:27] +; GFX908-NEXT: v_readlane_b32 s16, v39, 22 ; GFX908-NEXT: v_readlane_b32 s17, v39, 23 ; GFX908-NEXT: v_mov_b32_e32 v40, v32 ; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -611,20 +611,20 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX908-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX908-NEXT: s_mov_b64 exec, s[34:35] -; GFX908-NEXT: v_readlane_b32 s4, v39, 10 -; GFX908-NEXT: v_readlane_b32 s6, v39, 8 -; GFX908-NEXT: v_readlane_b32 s8, v39, 6 -; GFX908-NEXT: v_readlane_b32 s10, v39, 4 -; GFX908-NEXT: v_readlane_b32 s16, v39, 22 ; GFX908-NEXT: v_readlane_b32 s12, v39, 3 ; GFX908-NEXT: v_mov_b32_e32 v31, v40 ; GFX908-NEXT: v_readlane_b32 s13, v39, 2 ; GFX908-NEXT: v_readlane_b32 s14, v39, 1 ; GFX908-NEXT: v_readlane_b32 s15, v39, 0 +; GFX908-NEXT: v_readlane_b32 s4, v39, 10 ; GFX908-NEXT: v_readlane_b32 s5, v39, 11 +; GFX908-NEXT: v_readlane_b32 s6, v39, 8 ; GFX908-NEXT: v_readlane_b32 s7, v39, 9 +; GFX908-NEXT: v_readlane_b32 s8, v39, 6 ; GFX908-NEXT: v_readlane_b32 s9, v39, 7 +; GFX908-NEXT: v_readlane_b32 s10, v39, 4 ; GFX908-NEXT: v_readlane_b32 s11, v39, 5 +; GFX908-NEXT: v_readlane_b32 s16, v39, 22 ; GFX908-NEXT: v_readlane_b32 s17, v39, 23 ; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1 @@ -632,19 +632,19 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX908-NEXT: s_mov_b64 exec, s[34:35] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_readlane_b32 s4, v39, 10 -; GFX908-NEXT: v_readlane_b32 s6, v39, 8 -; GFX908-NEXT: v_readlane_b32 s8, v39, 6 -; GFX908-NEXT: v_readlane_b32 s10, v39, 4 -; GFX908-NEXT: v_readlane_b32 s16, v39, 22 ; GFX908-NEXT: v_readlane_b32 s5, v39, 11 +; GFX908-NEXT: v_readlane_b32 s6, v39, 8 ; GFX908-NEXT: v_readlane_b32 s7, v39, 9 +; GFX908-NEXT: v_readlane_b32 s8, v39, 6 ; GFX908-NEXT: v_readlane_b32 s9, v39, 7 +; GFX908-NEXT: v_readlane_b32 s10, v39, 4 ; GFX908-NEXT: v_readlane_b32 s11, v39, 5 ; GFX908-NEXT: v_readlane_b32 s12, v39, 3 ; GFX908-NEXT: v_readlane_b32 s13, v39, 2 ; GFX908-NEXT: v_readlane_b32 s14, v39, 1 ; GFX908-NEXT: v_readlane_b32 s15, v39, 0 ; GFX908-NEXT: v_mov_b32_e32 v31, v40 +; GFX908-NEXT: v_readlane_b32 s16, v39, 22 ; GFX908-NEXT: v_readlane_b32 s17, v39, 23 ; GFX908-NEXT: v_readlane_b32 s21, v39, 12 ; GFX908-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll index 15f5f890d57b5..d1dee534414ac 100644 --- a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll @@ -8,6 +8,7 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-LABEL: kernel0: ; CHECK: ; %bb.0: +; CHECK-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART @@ -19,10 +20,9 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[2:3] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane -; CHECK-NEXT: s_load_dword s0, s[8:9], 0x8 ; CHECK-NEXT: v_writelane_b32 v22, s2, 0 ; CHECK-NEXT: v_writelane_b32 v22, s3, 1 +; CHECK-NEXT: s_load_dword s0, s[8:9], 0x8 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[48:51] ; CHECK-NEXT: ;;#ASMEND @@ -123,19 +123,19 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: v_writelane_b32 v22, s0, 58 ; CHECK-NEXT: v_writelane_b32 v22, s1, 59 ; CHECK-NEXT: v_writelane_b32 v22, s2, 60 +; CHECK-NEXT: ; implicit-def: $vgpr23 : SGPR spill to VGPR lane ; CHECK-NEXT: v_writelane_b32 v22, s3, 61 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ; implicit-def: $vgpr23 : SGPR spill to VGPR lane -; CHECK-NEXT: v_writelane_b32 v22, s0, 62 ; CHECK-NEXT: v_writelane_b32 v23, s2, 0 ; CHECK-NEXT: v_writelane_b32 v23, s3, 1 ; CHECK-NEXT: v_writelane_b32 v23, s4, 2 ; CHECK-NEXT: v_writelane_b32 v23, s5, 3 ; CHECK-NEXT: v_writelane_b32 v23, s6, 4 -; CHECK-NEXT: v_writelane_b32 v22, s1, 63 +; CHECK-NEXT: v_writelane_b32 v22, s0, 62 ; CHECK-NEXT: v_writelane_b32 v23, s7, 5 +; CHECK-NEXT: v_writelane_b32 v22, s1, 63 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:15] ; CHECK-NEXT: ;;#ASMEND @@ -208,6 +208,9 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_readlane_b32 s0, v22, 2 ; CHECK-NEXT: v_readlane_b32 s1, v22, 3 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[48:51] +; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_readlane_b32 s2, v22, 4 ; CHECK-NEXT: v_readlane_b32 s3, v22, 5 ; CHECK-NEXT: v_readlane_b32 s4, v22, 6 @@ -215,9 +218,6 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: v_readlane_b32 s6, v22, 8 ; CHECK-NEXT: v_readlane_b32 s7, v22, 9 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[48:51] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:7] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_readlane_b32 s0, v22, 10 @@ -241,29 +241,23 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_readlane_b32 s0, v22, 26 ; CHECK-NEXT: v_readlane_b32 s1, v22, 27 -; CHECK-NEXT: v_readlane_b32 s2, v22, 28 -; CHECK-NEXT: v_readlane_b32 s3, v22, 29 -; CHECK-NEXT: v_readlane_b32 s4, v22, 30 -; CHECK-NEXT: v_readlane_b32 s5, v22, 31 -; CHECK-NEXT: v_readlane_b32 s6, v22, 32 -; CHECK-NEXT: v_readlane_b32 s7, v22, 33 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[38:39] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[44:47] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s2, v22, 28 +; CHECK-NEXT: v_readlane_b32 s3, v22, 29 +; CHECK-NEXT: v_readlane_b32 s4, v22, 30 +; CHECK-NEXT: v_readlane_b32 s5, v22, 31 +; CHECK-NEXT: v_readlane_b32 s6, v22, 32 +; CHECK-NEXT: v_readlane_b32 s7, v22, 33 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:7] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_readlane_b32 s0, v22, 34 ; CHECK-NEXT: v_readlane_b32 s1, v22, 35 -; CHECK-NEXT: v_readlane_b32 s2, v22, 36 -; CHECK-NEXT: v_readlane_b32 s3, v22, 37 -; CHECK-NEXT: v_readlane_b32 s4, v22, 38 -; CHECK-NEXT: v_readlane_b32 s5, v22, 39 -; CHECK-NEXT: v_readlane_b32 s6, v22, 40 -; CHECK-NEXT: v_readlane_b32 s7, v22, 41 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[16:31] ; CHECK-NEXT: ;;#ASMEND @@ -273,6 +267,12 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[40:43] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s2, v22, 36 +; CHECK-NEXT: v_readlane_b32 s3, v22, 37 +; CHECK-NEXT: v_readlane_b32 s4, v22, 38 +; CHECK-NEXT: v_readlane_b32 s5, v22, 39 +; CHECK-NEXT: v_readlane_b32 s6, v22, 40 +; CHECK-NEXT: v_readlane_b32 s7, v22, 41 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:7] ; CHECK-NEXT: ;;#ASMEND @@ -297,11 +297,11 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_readlane_b32 s0, v22, 58 ; CHECK-NEXT: v_readlane_b32 s1, v22, 59 -; CHECK-NEXT: v_readlane_b32 s2, v22, 60 -; CHECK-NEXT: v_readlane_b32 s3, v22, 61 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[34:35] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s2, v22, 60 +; CHECK-NEXT: v_readlane_b32 s3, v22, 61 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:3] ; CHECK-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll index 8f8e2c0ba52fc..f196004e7660b 100644 --- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll +++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll @@ -276,10 +276,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: .LBB1_24: ; %Flow23 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: s_load_dwordx4 s[4:7], s[38:39], 0x0 -; GLOBALNESS1-NEXT: v_readlane_b32 s70, v57, 8 -; GLOBALNESS1-NEXT: v_readlane_b32 s8, v57, 10 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0 +; GLOBALNESS1-NEXT: v_readlane_b32 s70, v57, 8 ; GLOBALNESS1-NEXT: v_readlane_b32 s71, v57, 9 +; GLOBALNESS1-NEXT: v_readlane_b32 s8, v57, 10 ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) ; GLOBALNESS1-NEXT: s_mov_b32 s55, s7 ; GLOBALNESS1-NEXT: v_readlane_b32 s9, v57, 11 @@ -587,11 +587,11 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_branch .LBB1_14 ; GLOBALNESS0-NEXT: .LBB1_24: ; %Flow23 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_readlane_b32 s84, v57, 8 -; GLOBALNESS0-NEXT: v_readlane_b32 s8, v57, 10 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0 ; GLOBALNESS0-NEXT: s_mov_b32 s55, s83 +; GLOBALNESS0-NEXT: v_readlane_b32 s84, v57, 8 ; GLOBALNESS0-NEXT: v_readlane_b32 s85, v57, 9 +; GLOBALNESS0-NEXT: v_readlane_b32 s8, v57, 10 ; GLOBALNESS0-NEXT: v_readlane_b32 s9, v57, 11 ; GLOBALNESS0-NEXT: .LBB1_25: ; %Flow24 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll index e9a0671ead4e0..57ddcb20d613c 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -158,7 +158,6 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) ; GFX9-O0-NEXT: v_writelane_b32 v5, s34, 2 ; GFX9-O0-NEXT: v_writelane_b32 v5, s35, 3 ; GFX9-O0-NEXT: s_mov_b32 s34, 0 -; GFX9-O0-NEXT: s_nop 2 ; GFX9-O0-NEXT: buffer_load_dwordx2 v[3:4], off, s[36:39], s34 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll index 72672c8b6efad..6a3a58e3ab120 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll @@ -135,7 +135,6 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: v_writelane_b32 v5, s0, 3 ; GFX9-O0-NEXT: v_writelane_b32 v5, s1, 4 ; GFX9-O0-NEXT: s_mov_b32 s0, 0 -; GFX9-O0-NEXT: s_nop 2 ; GFX9-O0-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], s0 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill @@ -965,7 +964,6 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: v_writelane_b32 v5, s0, 3 ; GFX9-O0-NEXT: v_writelane_b32 v5, s1, 4 ; GFX9-O0-NEXT: s_mov_b32 s0, 0 -; GFX9-O0-NEXT: s_nop 2 ; GFX9-O0-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], s0 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill