From f23f78ce118481e650d5573263daf312e3da071e Mon Sep 17 00:00:00 2001 From: Nathan Corbyn Date: Fri, 21 Nov 2025 17:21:04 +0000 Subject: [PATCH] =?UTF-8?q?Revert=20"[AMDGPU]=20Remove=20leftover=20implic?= =?UTF-8?q?it=20operands=20from=20SI=5FSPILL/SI=5FRESTORE.=20=E2=80=A6"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit b79a665f7170fbb631b13175ec747ccfd779bf9e. --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 4 +- .../CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll | 1272 ++++++++--------- .../CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll | 62 +- llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll | 2 +- .../CodeGen/AMDGPU/fold-reload-into-exec.mir | 16 +- .../identical-subrange-spill-infloop.ll | 38 +- .../CodeGen/AMDGPU/insert_vector_dynelt.ll | 4 +- llvm/test/CodeGen/AMDGPU/load-constant-i1.ll | 4 +- .../AMDGPU/preserve-wwm-copy-dst-reg.ll | 40 +- .../scc-clobbered-sgpr-to-vmem-spill.ll | 44 +- .../AMDGPU/tuple-allocation-failure.ll | 8 +- .../test/CodeGen/AMDGPU/wwm-reserved-spill.ll | 1 + llvm/test/CodeGen/AMDGPU/wwm-reserved.ll | 2 + 13 files changed, 750 insertions(+), 747 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 66e1873319553..a7333e3373f38 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2094,11 +2094,11 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { break; case AMDGPU::SI_SPILL_S32_TO_VGPR: - mutateAndCleanupImplicit(MI, get(AMDGPU::V_WRITELANE_B32)); + MI.setDesc(get(AMDGPU::V_WRITELANE_B32)); break; case AMDGPU::SI_RESTORE_S32_FROM_VGPR: - mutateAndCleanupImplicit(MI, get(AMDGPU::V_READLANE_B32)); + MI.setDesc(get(AMDGPU::V_READLANE_B32)); break; case AMDGPU::AV_MOV_B32_IMM_PSEUDO: { Register Dst = MI.getOperand(0).getReg(); diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll index 4519e2796fed3..4c5c56a49fdc6 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll @@ -8181,8 +8181,8 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s18 ; SI-NEXT: v_readlane_b32 s18, v23, 0 ; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: s_lshl_b32 s18, s18, 8 ; SI-NEXT: v_readlane_b32 s19, v23, 1 +; SI-NEXT: s_lshl_b32 s18, s18, 8 ; SI-NEXT: s_or_b32 s16, s16, s18 ; SI-NEXT: v_readlane_b32 s18, v23, 2 ; SI-NEXT: v_readlane_b32 s19, v23, 3 @@ -8215,8 +8215,8 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s16 ; SI-NEXT: v_readlane_b32 s16, v23, 6 ; SI-NEXT: s_and_b32 s14, s14, 0xff -; SI-NEXT: s_lshl_b32 s16, s16, 8 ; SI-NEXT: v_readlane_b32 s17, v23, 7 +; SI-NEXT: s_lshl_b32 s16, s16, 8 ; SI-NEXT: s_or_b32 s14, s14, s16 ; SI-NEXT: v_readlane_b32 s16, v23, 8 ; SI-NEXT: v_readlane_b32 s17, v23, 9 @@ -8249,8 +8249,8 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s14 ; SI-NEXT: v_readlane_b32 s14, v23, 12 ; SI-NEXT: s_and_b32 s12, s12, 0xff -; SI-NEXT: s_lshl_b32 s14, s14, 8 ; SI-NEXT: v_readlane_b32 s15, v23, 13 +; SI-NEXT: s_lshl_b32 s14, s14, 8 ; SI-NEXT: s_or_b32 s12, s12, s14 ; SI-NEXT: v_readlane_b32 s14, v23, 14 ; SI-NEXT: v_readlane_b32 s15, v23, 15 @@ -8283,8 +8283,8 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s12 ; SI-NEXT: v_readlane_b32 s12, v23, 18 ; SI-NEXT: s_and_b32 s10, s10, 0xff -; SI-NEXT: s_lshl_b32 s12, s12, 8 ; SI-NEXT: v_readlane_b32 s13, v23, 19 +; SI-NEXT: s_lshl_b32 s12, s12, 8 ; SI-NEXT: s_or_b32 s10, s10, s12 ; SI-NEXT: v_readlane_b32 s12, v23, 20 ; SI-NEXT: v_readlane_b32 s13, v23, 21 @@ -8317,8 +8317,8 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: v_readlane_b32 s10, v23, 24 ; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: s_lshl_b32 s10, s10, 8 ; SI-NEXT: v_readlane_b32 s11, v23, 25 +; SI-NEXT: s_lshl_b32 s10, s10, 8 ; SI-NEXT: s_or_b32 s8, s8, s10 ; SI-NEXT: v_readlane_b32 s10, v23, 26 ; SI-NEXT: v_readlane_b32 s11, v23, 27 @@ -8350,8 +8350,8 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_readlane_b32 s8, v23, 30 ; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: s_lshl_b32 s8, s8, 8 ; SI-NEXT: v_readlane_b32 s9, v23, 31 +; SI-NEXT: s_lshl_b32 s8, s8, 8 ; SI-NEXT: s_or_b32 s6, s6, s8 ; SI-NEXT: v_readlane_b32 s8, v23, 32 ; SI-NEXT: v_readlane_b32 s9, v23, 33 @@ -8384,8 +8384,8 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_readlane_b32 s6, v23, 36 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s6, s6, 8 ; SI-NEXT: v_readlane_b32 s7, v23, 37 +; SI-NEXT: s_lshl_b32 s6, s6, 8 ; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: v_readlane_b32 s6, v23, 38 ; SI-NEXT: v_readlane_b32 s7, v23, 39 @@ -8468,149 +8468,148 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB13_4: +; SI-NEXT: ; implicit-def: $sgpr51 ; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v23, s50, 0 -; SI-NEXT: ; implicit-def: $sgpr51 -; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: v_writelane_b32 v23, s51, 1 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: v_writelane_b32 v23, s50, 2 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s50, 2 ; SI-NEXT: v_writelane_b32 v23, s51, 3 +; SI-NEXT: ; implicit-def: $sgpr50 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: v_writelane_b32 v23, s50, 4 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s50, 4 ; SI-NEXT: v_writelane_b32 v23, s51, 5 +; SI-NEXT: ; implicit-def: $sgpr50 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: v_writelane_b32 v23, s50, 6 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s50, 6 ; SI-NEXT: v_writelane_b32 v23, s51, 7 +; SI-NEXT: ; implicit-def: $sgpr50 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: v_writelane_b32 v23, s50, 8 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s50, 8 ; SI-NEXT: v_writelane_b32 v23, s51, 9 +; SI-NEXT: ; implicit-def: $sgpr50 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: v_writelane_b32 v23, s50, 10 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s50, 10 ; SI-NEXT: v_writelane_b32 v23, s51, 11 +; SI-NEXT: ; implicit-def: $sgpr50 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: v_writelane_b32 v23, s50, 12 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s50, 12 ; SI-NEXT: v_writelane_b32 v23, s51, 13 +; SI-NEXT: ; implicit-def: $sgpr50 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: v_writelane_b32 v23, s50, 14 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s50, 14 ; SI-NEXT: v_writelane_b32 v23, s51, 15 +; SI-NEXT: ; implicit-def: $sgpr50 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: v_writelane_b32 v23, s50, 16 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s50, 16 ; SI-NEXT: v_writelane_b32 v23, s51, 17 +; SI-NEXT: ; implicit-def: $sgpr50 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: v_writelane_b32 v23, s50, 18 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s50, 18 ; SI-NEXT: v_writelane_b32 v23, s51, 19 +; SI-NEXT: ; implicit-def: $sgpr50 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: v_writelane_b32 v23, s50, 20 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s50, 20 ; SI-NEXT: v_writelane_b32 v23, s51, 21 +; SI-NEXT: ; implicit-def: $sgpr50 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: v_writelane_b32 v23, s50, 22 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s50, 22 ; SI-NEXT: v_writelane_b32 v23, s51, 23 +; SI-NEXT: ; implicit-def: $sgpr50 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: v_writelane_b32 v23, s50, 24 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s50, 24 ; SI-NEXT: v_writelane_b32 v23, s51, 25 +; SI-NEXT: ; implicit-def: $sgpr50 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: v_writelane_b32 v23, s50, 26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s50, 26 ; SI-NEXT: v_writelane_b32 v23, s51, 27 +; SI-NEXT: ; implicit-def: $sgpr50 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: v_writelane_b32 v23, s50, 28 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s50, 28 ; SI-NEXT: v_writelane_b32 v23, s51, 29 +; SI-NEXT: ; implicit-def: $sgpr50 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: v_writelane_b32 v23, s50, 30 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s50, 30 ; SI-NEXT: v_writelane_b32 v23, s51, 31 +; SI-NEXT: ; implicit-def: $sgpr50 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: v_writelane_b32 v23, s50, 32 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s50, 32 ; SI-NEXT: v_writelane_b32 v23, s51, 33 +; SI-NEXT: ; implicit-def: $sgpr50 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: v_writelane_b32 v23, s50, 34 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s50, 34 ; SI-NEXT: v_writelane_b32 v23, s51, 35 +; SI-NEXT: ; implicit-def: $sgpr50 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: v_writelane_b32 v23, s50, 36 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s50, 36 ; SI-NEXT: v_writelane_b32 v23, s51, 37 +; SI-NEXT: ; implicit-def: $sgpr50 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: v_writelane_b32 v23, s50, 38 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s50, 38 ; SI-NEXT: v_writelane_b32 v23, s51, 39 +; SI-NEXT: ; implicit-def: $sgpr50 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: v_writelane_b32 v23, s50, 40 ; SI-NEXT: ; implicit-def: $sgpr49 ; SI-NEXT: ; implicit-def: $sgpr55 ; SI-NEXT: ; implicit-def: $sgpr54 @@ -8635,6 +8634,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr58 ; SI-NEXT: ; implicit-def: $sgpr28 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s50, 40 ; SI-NEXT: ; implicit-def: $sgpr98 ; SI-NEXT: ; implicit-def: $sgpr96 ; SI-NEXT: ; implicit-def: $sgpr86 @@ -10597,9 +10597,10 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: .LBB13_4: ; GFX9-NEXT: ; implicit-def: $sgpr27 ; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr83 +; GFX9-NEXT: ; implicit-def: $sgpr82 ; GFX9-NEXT: ; implicit-def: $sgpr27 ; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr82 ; GFX9-NEXT: v_writelane_b32 v22, s82, 0 ; GFX9-NEXT: ; implicit-def: $sgpr27 ; GFX9-NEXT: ; kill: killed $sgpr27 @@ -10632,7 +10633,6 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: ; implicit-def: $sgpr85 ; GFX9-NEXT: ; implicit-def: $sgpr84 ; GFX9-NEXT: ; implicit-def: $sgpr81 -; GFX9-NEXT: ; implicit-def: $sgpr83 ; GFX9-NEXT: ; implicit-def: $sgpr36 ; GFX9-NEXT: ; implicit-def: $sgpr34 ; GFX9-NEXT: ; implicit-def: $sgpr30 @@ -10985,12 +10985,14 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: s_lshr_b64 s[62:63], s[20:21], 24 ; GFX11-NEXT: s_branch .LBB13_3 ; GFX11-NEXT: .LBB13_2: -; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 0 ; GFX11-NEXT: ; implicit-def: $vcc_hi ; GFX11-NEXT: ; implicit-def: $vcc_lo ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: s_mov_b32 s101, -1 +; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 0 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 @@ -10999,6 +11001,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 1 +; GFX11-NEXT: ; implicit-def: $vcc_lo ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 @@ -11010,7 +11013,6 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 2 -; GFX11-NEXT: ; implicit-def: $vcc_lo ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 @@ -11022,6 +11024,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 3 +; GFX11-NEXT: ; implicit-def: $vcc_lo ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 @@ -11033,21 +11036,18 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 4 -; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: s_mov_b32 s101, -1 -; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 5 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 5 +; GFX11-NEXT: ; implicit-def: $vcc_lo ; GFX11-NEXT: ; implicit-def: $sgpr45 ; GFX11-NEXT: ; implicit-def: $sgpr44 ; GFX11-NEXT: ; implicit-def: $sgpr30 @@ -11111,17 +11111,17 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: ; implicit-def: $sgpr88 ; GFX11-NEXT: ; implicit-def: $sgpr76 ; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 6 -; GFX11-NEXT: ; implicit-def: $vcc_lo ; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 7 -; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 8 ; GFX11-NEXT: ; implicit-def: $vcc_lo +; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 8 ; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 9 -; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 10 ; GFX11-NEXT: ; implicit-def: $vcc_lo +; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 10 ; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 11 -; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 12 ; GFX11-NEXT: ; implicit-def: $vcc_lo +; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 12 ; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 13 +; GFX11-NEXT: ; implicit-def: $vcc_lo ; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 14 ; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 15 ; GFX11-NEXT: .LBB13_3: ; %Flow @@ -44444,149 +44444,147 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v37 ; SI-NEXT: s_branch .LBB37_5 ; SI-NEXT: .LBB37_3: +; SI-NEXT: ; implicit-def: $sgpr61 ; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: v_writelane_b32 v61, s60, 0 -; SI-NEXT: ; implicit-def: $sgpr61 ; SI-NEXT: v_writelane_b32 v61, s61, 1 -; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: v_writelane_b32 v61, s60, 2 -; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s61, 3 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s60, 2 +; SI-NEXT: v_writelane_b32 v61, s61, 3 ; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: v_writelane_b32 v61, s60, 4 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s61, 5 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s60, 4 +; SI-NEXT: v_writelane_b32 v61, s61, 5 ; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: v_writelane_b32 v61, s60, 6 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s61, 7 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s60, 6 +; SI-NEXT: v_writelane_b32 v61, s61, 7 ; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: v_writelane_b32 v61, s60, 8 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s61, 9 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s60, 8 +; SI-NEXT: v_writelane_b32 v61, s61, 9 ; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: v_writelane_b32 v61, s60, 10 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s61, 11 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s60, 10 +; SI-NEXT: v_writelane_b32 v61, s61, 11 ; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: v_writelane_b32 v61, s60, 12 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s61, 13 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s60, 12 +; SI-NEXT: v_writelane_b32 v61, s61, 13 ; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: v_writelane_b32 v61, s60, 14 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s61, 15 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s60, 14 +; SI-NEXT: v_writelane_b32 v61, s61, 15 ; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: v_writelane_b32 v61, s60, 16 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s61, 17 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s60, 16 +; SI-NEXT: v_writelane_b32 v61, s61, 17 ; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: v_writelane_b32 v61, s60, 18 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s61, 19 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s60, 18 +; SI-NEXT: v_writelane_b32 v61, s61, 19 ; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: v_writelane_b32 v61, s60, 20 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s61, 21 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s60, 20 +; SI-NEXT: v_writelane_b32 v61, s61, 21 ; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: v_writelane_b32 v61, s60, 22 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s61, 23 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s60, 22 +; SI-NEXT: v_writelane_b32 v61, s61, 23 ; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: v_writelane_b32 v61, s60, 24 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s61, 25 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s60, 24 +; SI-NEXT: v_writelane_b32 v61, s61, 25 ; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: v_writelane_b32 v61, s60, 26 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s61, 27 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s60, 26 +; SI-NEXT: v_writelane_b32 v61, s61, 27 ; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: v_writelane_b32 v61, s60, 28 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s61, 29 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s60, 28 +; SI-NEXT: v_writelane_b32 v61, s61, 29 ; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: v_writelane_b32 v61, s60, 30 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s61, 31 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s60, 30 +; SI-NEXT: v_writelane_b32 v61, s61, 31 ; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: v_writelane_b32 v61, s60, 32 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s61, 33 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s60, 32 +; SI-NEXT: v_writelane_b32 v61, s61, 33 ; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: v_writelane_b32 v61, s60, 34 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s61, 35 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s60, 34 +; SI-NEXT: v_writelane_b32 v61, s61, 35 ; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: v_writelane_b32 v61, s60, 36 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s61, 37 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s60, 36 +; SI-NEXT: v_writelane_b32 v61, s61, 37 ; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: v_writelane_b32 v61, s60, 38 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s61, 39 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s60, 38 +; SI-NEXT: v_writelane_b32 v61, s61, 39 ; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: v_writelane_b32 v61, s60, 40 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s61, 41 ; SI-NEXT: ; implicit-def: $sgpr97 ; SI-NEXT: ; implicit-def: $sgpr96 ; SI-NEXT: ; implicit-def: $sgpr28 @@ -44595,6 +44593,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr27 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s60, 40 +; SI-NEXT: v_writelane_b32 v61, s61, 41 ; SI-NEXT: ; implicit-def: $sgpr98 ; SI-NEXT: ; implicit-def: $sgpr86 ; SI-NEXT: ; implicit-def: $sgpr84 @@ -44766,7 +44766,6 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v55, s97 ; SI-NEXT: v_mov_b32_e32 v54, s96 ; SI-NEXT: v_mov_b32_e32 v52, s60 ; SI-NEXT: v_mov_b32_e32 v47, s28 @@ -44787,47 +44786,48 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_mov_b32_e32 v31, s46 ; SI-NEXT: v_mov_b32_e32 v36, s56 ; SI-NEXT: v_readlane_b32 s26, v61, 40 -; SI-NEXT: v_readlane_b32 s27, v61, 41 ; SI-NEXT: v_readlane_b32 s28, v61, 38 -; SI-NEXT: v_readlane_b32 s29, v61, 39 ; SI-NEXT: v_readlane_b32 s6, v61, 36 -; SI-NEXT: v_readlane_b32 s7, v61, 37 ; SI-NEXT: v_readlane_b32 s58, v61, 34 -; SI-NEXT: v_readlane_b32 s59, v61, 35 ; SI-NEXT: v_readlane_b32 s60, v61, 32 -; SI-NEXT: v_readlane_b32 s61, v61, 33 ; SI-NEXT: v_readlane_b32 s8, v61, 30 -; SI-NEXT: v_readlane_b32 s9, v61, 31 ; SI-NEXT: v_readlane_b32 s10, v61, 28 -; SI-NEXT: v_readlane_b32 s11, v61, 29 ; SI-NEXT: v_readlane_b32 s12, v61, 26 -; SI-NEXT: v_readlane_b32 s13, v61, 27 ; SI-NEXT: v_readlane_b32 s14, v61, 24 -; SI-NEXT: v_readlane_b32 s15, v61, 25 ; SI-NEXT: v_readlane_b32 s16, v61, 22 -; SI-NEXT: v_readlane_b32 s17, v61, 23 ; SI-NEXT: s_mov_b32 s96, s94 ; SI-NEXT: v_readlane_b32 s94, v61, 20 -; SI-NEXT: v_readlane_b32 s95, v61, 21 ; SI-NEXT: v_readlane_b32 s18, v61, 18 -; SI-NEXT: v_readlane_b32 s19, v61, 19 ; SI-NEXT: v_readlane_b32 s20, v61, 16 -; SI-NEXT: v_readlane_b32 s21, v61, 17 ; SI-NEXT: v_readlane_b32 s22, v61, 14 -; SI-NEXT: v_readlane_b32 s23, v61, 15 ; SI-NEXT: v_readlane_b32 s24, v61, 12 -; SI-NEXT: v_readlane_b32 s25, v61, 13 ; SI-NEXT: v_readlane_b32 s40, v61, 10 -; SI-NEXT: v_readlane_b32 s41, v61, 11 ; SI-NEXT: v_readlane_b32 s42, v61, 8 -; SI-NEXT: v_readlane_b32 s43, v61, 9 ; SI-NEXT: v_readlane_b32 s44, v61, 6 -; SI-NEXT: v_readlane_b32 s45, v61, 7 ; SI-NEXT: v_readlane_b32 s46, v61, 4 -; SI-NEXT: v_readlane_b32 s47, v61, 5 ; SI-NEXT: v_readlane_b32 s56, v61, 2 -; SI-NEXT: v_readlane_b32 s57, v61, 3 ; SI-NEXT: v_readlane_b32 vcc_lo, v61, 0 +; SI-NEXT: v_mov_b32_e32 v55, s97 +; SI-NEXT: v_readlane_b32 s27, v61, 41 +; SI-NEXT: v_readlane_b32 s29, v61, 39 +; SI-NEXT: v_readlane_b32 s7, v61, 37 +; SI-NEXT: v_readlane_b32 s59, v61, 35 +; SI-NEXT: v_readlane_b32 s61, v61, 33 +; SI-NEXT: v_readlane_b32 s9, v61, 31 +; SI-NEXT: v_readlane_b32 s11, v61, 29 +; SI-NEXT: v_readlane_b32 s13, v61, 27 +; SI-NEXT: v_readlane_b32 s15, v61, 25 +; SI-NEXT: v_readlane_b32 s17, v61, 23 +; SI-NEXT: v_readlane_b32 s95, v61, 21 +; SI-NEXT: v_readlane_b32 s19, v61, 19 +; SI-NEXT: v_readlane_b32 s21, v61, 17 +; SI-NEXT: v_readlane_b32 s23, v61, 15 +; SI-NEXT: v_readlane_b32 s25, v61, 13 +; SI-NEXT: v_readlane_b32 s41, v61, 11 +; SI-NEXT: v_readlane_b32 s43, v61, 9 +; SI-NEXT: v_readlane_b32 s45, v61, 7 +; SI-NEXT: v_readlane_b32 s47, v61, 5 +; SI-NEXT: v_readlane_b32 s57, v61, 3 ; SI-NEXT: v_readlane_b32 vcc_hi, v61, 1 ; SI-NEXT: .LBB37_5: ; %end ; SI-NEXT: s_waitcnt vmcnt(14) @@ -81058,8 +81058,8 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s18 ; SI-NEXT: v_readlane_b32 s18, v23, 0 ; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: s_lshl_b32 s18, s18, 8 ; SI-NEXT: v_readlane_b32 s19, v23, 1 +; SI-NEXT: s_lshl_b32 s18, s18, 8 ; SI-NEXT: s_or_b32 s16, s16, s18 ; SI-NEXT: v_readlane_b32 s18, v23, 2 ; SI-NEXT: v_readlane_b32 s19, v23, 3 @@ -81092,8 +81092,8 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s16 ; SI-NEXT: v_readlane_b32 s16, v23, 6 ; SI-NEXT: s_and_b32 s14, s14, 0xff -; SI-NEXT: s_lshl_b32 s16, s16, 8 ; SI-NEXT: v_readlane_b32 s17, v23, 7 +; SI-NEXT: s_lshl_b32 s16, s16, 8 ; SI-NEXT: s_or_b32 s14, s14, s16 ; SI-NEXT: v_readlane_b32 s16, v23, 8 ; SI-NEXT: v_readlane_b32 s17, v23, 9 @@ -81126,8 +81126,8 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s14 ; SI-NEXT: v_readlane_b32 s14, v23, 12 ; SI-NEXT: s_and_b32 s12, s12, 0xff -; SI-NEXT: s_lshl_b32 s14, s14, 8 ; SI-NEXT: v_readlane_b32 s15, v23, 13 +; SI-NEXT: s_lshl_b32 s14, s14, 8 ; SI-NEXT: s_or_b32 s12, s12, s14 ; SI-NEXT: v_readlane_b32 s14, v23, 14 ; SI-NEXT: v_readlane_b32 s15, v23, 15 @@ -81160,8 +81160,8 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s12 ; SI-NEXT: v_readlane_b32 s12, v23, 18 ; SI-NEXT: s_and_b32 s10, s10, 0xff -; SI-NEXT: s_lshl_b32 s12, s12, 8 ; SI-NEXT: v_readlane_b32 s13, v23, 19 +; SI-NEXT: s_lshl_b32 s12, s12, 8 ; SI-NEXT: s_or_b32 s10, s10, s12 ; SI-NEXT: v_readlane_b32 s12, v23, 20 ; SI-NEXT: v_readlane_b32 s13, v23, 21 @@ -81194,8 +81194,8 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: v_readlane_b32 s10, v23, 24 ; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: s_lshl_b32 s10, s10, 8 ; SI-NEXT: v_readlane_b32 s11, v23, 25 +; SI-NEXT: s_lshl_b32 s10, s10, 8 ; SI-NEXT: s_or_b32 s8, s8, s10 ; SI-NEXT: v_readlane_b32 s10, v23, 26 ; SI-NEXT: v_readlane_b32 s11, v23, 27 @@ -81228,8 +81228,8 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_readlane_b32 s8, v23, 30 ; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: s_lshl_b32 s8, s8, 8 ; SI-NEXT: v_readlane_b32 s9, v23, 31 +; SI-NEXT: s_lshl_b32 s8, s8, 8 ; SI-NEXT: s_or_b32 s6, s6, s8 ; SI-NEXT: v_readlane_b32 s8, v23, 32 ; SI-NEXT: v_readlane_b32 s9, v23, 33 @@ -81347,6 +81347,8 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v23, s54, 0 ; SI-NEXT: ; implicit-def: $sgpr26 @@ -81355,174 +81357,172 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr98 +; SI-NEXT: ; implicit-def: $sgpr96 +; SI-NEXT: ; implicit-def: $sgpr86 +; SI-NEXT: ; implicit-def: $sgpr84 +; SI-NEXT: ; implicit-def: $sgpr82 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: v_writelane_b32 v23, s54, 2 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: v_writelane_b32 v23, s55, 3 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: v_writelane_b32 v23, s54, 4 -; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s55, 5 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s54, 4 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: v_writelane_b32 v23, s54, 6 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s55, 5 +; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s55, 7 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: v_writelane_b32 v23, s54, 8 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s54, 6 +; SI-NEXT: v_writelane_b32 v23, s55, 7 +; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s55, 9 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: v_writelane_b32 v23, s54, 10 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s54, 8 +; SI-NEXT: v_writelane_b32 v23, s55, 9 +; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s55, 11 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: v_writelane_b32 v23, s54, 12 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s54, 10 +; SI-NEXT: v_writelane_b32 v23, s55, 11 +; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s55, 13 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: v_writelane_b32 v23, s54, 14 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s54, 12 +; SI-NEXT: v_writelane_b32 v23, s55, 13 +; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s55, 15 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: v_writelane_b32 v23, s54, 16 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s54, 14 +; SI-NEXT: v_writelane_b32 v23, s55, 15 +; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s55, 17 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: v_writelane_b32 v23, s54, 18 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s54, 16 +; SI-NEXT: v_writelane_b32 v23, s55, 17 +; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s55, 19 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: v_writelane_b32 v23, s54, 20 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s54, 18 +; SI-NEXT: v_writelane_b32 v23, s55, 19 +; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s55, 21 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: v_writelane_b32 v23, s54, 22 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s54, 20 +; SI-NEXT: v_writelane_b32 v23, s55, 21 +; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s55, 23 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: v_writelane_b32 v23, s54, 24 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s54, 22 +; SI-NEXT: v_writelane_b32 v23, s55, 23 +; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s55, 25 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: v_writelane_b32 v23, s54, 26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s54, 24 +; SI-NEXT: v_writelane_b32 v23, s55, 25 +; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s55, 27 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: v_writelane_b32 v23, s54, 28 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s54, 26 +; SI-NEXT: v_writelane_b32 v23, s55, 27 +; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s55, 29 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: v_writelane_b32 v23, s54, 30 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s54, 28 +; SI-NEXT: v_writelane_b32 v23, s55, 29 +; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s55, 31 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: v_writelane_b32 v23, s54, 32 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s54, 30 +; SI-NEXT: v_writelane_b32 v23, s55, 31 +; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s55, 33 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: v_writelane_b32 v23, s54, 34 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s54, 32 +; SI-NEXT: v_writelane_b32 v23, s55, 33 +; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s55, 35 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: v_writelane_b32 v23, s54, 36 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s54, 34 +; SI-NEXT: v_writelane_b32 v23, s55, 35 +; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s55, 37 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s54, 36 +; SI-NEXT: v_writelane_b32 v23, s55, 37 ; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: v_writelane_b32 v23, s54, 38 -; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr53 -; SI-NEXT: ; implicit-def: $sgpr52 -; SI-NEXT: ; implicit-def: $sgpr51 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr49 -; SI-NEXT: ; implicit-def: $sgpr38 -; SI-NEXT: ; implicit-def: $sgpr36 -; SI-NEXT: ; implicit-def: $sgpr34 -; SI-NEXT: ; implicit-def: $sgpr30 -; SI-NEXT: ; implicit-def: $sgpr94 -; SI-NEXT: ; implicit-def: $sgpr92 -; SI-NEXT: ; implicit-def: $sgpr90 -; SI-NEXT: ; implicit-def: $sgpr88 -; SI-NEXT: ; implicit-def: $sgpr78 -; SI-NEXT: ; implicit-def: $sgpr76 -; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr28 -; SI-NEXT: ; implicit-def: $sgpr98 -; SI-NEXT: ; implicit-def: $sgpr96 -; SI-NEXT: ; implicit-def: $sgpr86 -; SI-NEXT: ; implicit-def: $sgpr84 -; SI-NEXT: ; implicit-def: $sgpr82 -; SI-NEXT: ; implicit-def: $sgpr80 -; SI-NEXT: ; implicit-def: $sgpr70 -; SI-NEXT: ; implicit-def: $sgpr68 -; SI-NEXT: ; implicit-def: $sgpr66 -; SI-NEXT: ; implicit-def: $sgpr64 -; SI-NEXT: v_writelane_b32 v23, s55, 39 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s54, 38 +; SI-NEXT: v_writelane_b32 v23, s55, 39 +; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: s_branch .LBB57_2 ; ; VI-LABEL: bitcast_v16i64_to_v128i8_scalar: @@ -83473,9 +83473,10 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: .LBB57_4: ; GFX9-NEXT: ; implicit-def: $sgpr27 ; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr83 +; GFX9-NEXT: ; implicit-def: $sgpr82 ; GFX9-NEXT: ; implicit-def: $sgpr27 ; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr82 ; GFX9-NEXT: v_writelane_b32 v22, s82, 0 ; GFX9-NEXT: ; implicit-def: $sgpr27 ; GFX9-NEXT: ; kill: killed $sgpr27 @@ -83508,7 +83509,6 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: ; implicit-def: $sgpr85 ; GFX9-NEXT: ; implicit-def: $sgpr84 ; GFX9-NEXT: ; implicit-def: $sgpr81 -; GFX9-NEXT: ; implicit-def: $sgpr83 ; GFX9-NEXT: ; implicit-def: $sgpr36 ; GFX9-NEXT: ; implicit-def: $sgpr34 ; GFX9-NEXT: ; implicit-def: $sgpr30 @@ -83861,12 +83861,14 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: s_lshr_b64 s[62:63], s[20:21], 24 ; GFX11-NEXT: s_branch .LBB57_3 ; GFX11-NEXT: .LBB57_2: -; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 0 ; GFX11-NEXT: ; implicit-def: $vcc_hi ; GFX11-NEXT: ; implicit-def: $vcc_lo ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: s_mov_b32 s101, -1 +; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 0 +; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 @@ -83875,6 +83877,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 1 +; GFX11-NEXT: ; implicit-def: $vcc_lo ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 @@ -83886,7 +83889,6 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 2 -; GFX11-NEXT: ; implicit-def: $vcc_lo ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 @@ -83898,6 +83900,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 3 +; GFX11-NEXT: ; implicit-def: $vcc_lo ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 @@ -83909,7 +83912,6 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 4 -; GFX11-NEXT: ; implicit-def: $vcc_lo ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 @@ -83918,12 +83920,10 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: s_mov_b32 s101, -1 -; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 5 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 5 +; GFX11-NEXT: ; implicit-def: $vcc_lo ; GFX11-NEXT: ; implicit-def: $sgpr45 ; GFX11-NEXT: ; implicit-def: $sgpr44 ; GFX11-NEXT: ; implicit-def: $sgpr30 @@ -83987,17 +83987,17 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: ; implicit-def: $sgpr88 ; GFX11-NEXT: ; implicit-def: $sgpr76 ; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 6 -; GFX11-NEXT: ; implicit-def: $vcc_lo ; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 7 -; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 8 ; GFX11-NEXT: ; implicit-def: $vcc_lo +; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 8 ; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 9 -; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 10 ; GFX11-NEXT: ; implicit-def: $vcc_lo +; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 10 ; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 11 -; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 12 ; GFX11-NEXT: ; implicit-def: $vcc_lo +; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 12 ; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 13 +; GFX11-NEXT: ; implicit-def: $vcc_lo ; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 14 ; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 15 ; GFX11-NEXT: .LBB57_3: ; %Flow @@ -115371,6 +115371,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: v_writelane_b32 v62, s48, 0 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 @@ -115378,6 +115380,37 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr98 +; SI-NEXT: ; implicit-def: $sgpr96 +; SI-NEXT: ; implicit-def: $sgpr86 +; SI-NEXT: ; implicit-def: $sgpr84 +; SI-NEXT: ; implicit-def: $sgpr82 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: v_writelane_b32 v62, s48, 2 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 @@ -115385,6 +115418,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: v_writelane_b32 v62, s48, 4 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 @@ -115392,6 +115427,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: v_writelane_b32 v62, s48, 6 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 @@ -115399,6 +115436,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: v_writelane_b32 v62, s48, 8 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 @@ -115406,6 +115445,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: v_writelane_b32 v62, s48, 10 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 @@ -115413,6 +115454,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: v_writelane_b32 v62, s48, 12 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 @@ -115420,6 +115463,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: v_writelane_b32 v62, s48, 14 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 @@ -115427,6 +115472,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: v_writelane_b32 v62, s48, 16 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 @@ -115434,6 +115481,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: v_writelane_b32 v62, s48, 18 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 @@ -115441,6 +115490,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: v_writelane_b32 v62, s48, 20 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 @@ -115448,6 +115499,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: v_writelane_b32 v62, s48, 22 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 @@ -115455,97 +115508,44 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: v_writelane_b32 v62, s48, 24 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: v_writelane_b32 v62, s49, 25 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: v_writelane_b32 v62, s48, 26 -; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v62, s49, 27 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: v_writelane_b32 v62, s48, 28 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v62, s49, 29 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: v_writelane_b32 v62, s48, 30 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v62, s48, 26 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v62, s49, 31 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v62, s49, 27 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: v_writelane_b32 v62, s48, 32 -; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: v_writelane_b32 v62, s49, 33 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr38 -; SI-NEXT: ; implicit-def: $sgpr36 -; SI-NEXT: ; implicit-def: $sgpr34 -; SI-NEXT: ; implicit-def: $sgpr30 -; SI-NEXT: ; implicit-def: $sgpr94 -; SI-NEXT: ; implicit-def: $sgpr92 -; SI-NEXT: ; implicit-def: $sgpr90 -; SI-NEXT: ; implicit-def: $sgpr88 -; SI-NEXT: ; implicit-def: $sgpr78 -; SI-NEXT: ; implicit-def: $sgpr76 -; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr28 -; SI-NEXT: ; implicit-def: $sgpr98 -; SI-NEXT: ; implicit-def: $sgpr96 -; SI-NEXT: ; implicit-def: $sgpr86 -; SI-NEXT: ; implicit-def: $sgpr84 -; SI-NEXT: ; implicit-def: $sgpr82 -; SI-NEXT: ; implicit-def: $sgpr80 -; SI-NEXT: ; implicit-def: $sgpr70 -; SI-NEXT: ; implicit-def: $sgpr68 -; SI-NEXT: ; implicit-def: $sgpr66 -; SI-NEXT: ; implicit-def: $sgpr64 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; implicit-def: $sgpr52 -; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: v_writelane_b32 v62, s48, 28 +; SI-NEXT: v_writelane_b32 v62, s49, 29 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v62, s48, 30 +; SI-NEXT: v_writelane_b32 v62, s49, 31 +; SI-NEXT: ; implicit-def: $sgpr48 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v62, s48, 32 +; SI-NEXT: v_writelane_b32 v62, s49, 33 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr48 ; SI-NEXT: s_branch .LBB73_2 ; SI-NEXT: .LBB73_4: ; SI-NEXT: v_mov_b32_e32 v1, s38 @@ -115601,9 +115601,9 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: v_mov_b32_e32 v16, s26 ; SI-NEXT: v_readlane_b32 s26, v62, 30 ; SI-NEXT: v_readlane_b32 s27, v62, 31 -; SI-NEXT: v_mov_b32_e32 v3, s8 ; SI-NEXT: v_mov_b32_e32 v51, s26 ; SI-NEXT: v_readlane_b32 s26, v62, 32 +; SI-NEXT: v_mov_b32_e32 v3, s8 ; SI-NEXT: v_readlane_b32 s27, v62, 33 ; SI-NEXT: v_mov_b32_e32 v38, s72 ; SI-NEXT: v_mov_b32_e32 v49, s62 @@ -164616,8 +164616,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_lshr_b32 s13, s4, 16 ; SI-NEXT: s_mov_b32 s5, s13 ; SI-NEXT: v_writelane_b32 v61, s4, 26 -; SI-NEXT: v_readfirstlane_b32 s4, v46 ; SI-NEXT: v_writelane_b32 v61, s5, 27 +; SI-NEXT: v_readfirstlane_b32 s4, v46 ; SI-NEXT: s_lshr_b32 s5, s4, 16 ; SI-NEXT: v_readfirstlane_b32 s4, v10 ; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 16 @@ -164811,92 +164811,191 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_branch .LBB91_3 ; SI-NEXT: .LBB91_2: ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v54, v59 ; SI-NEXT: v_writelane_b32 v62, s4, 0 ; SI-NEXT: v_writelane_b32 v62, s5, 1 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v59, v51 ; SI-NEXT: v_writelane_b32 v62, s4, 2 ; SI-NEXT: v_writelane_b32 v62, s5, 3 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v31, v46 ; SI-NEXT: v_writelane_b32 v62, s4, 4 ; SI-NEXT: v_writelane_b32 v62, s5, 5 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v34, v22 ; SI-NEXT: v_writelane_b32 v62, s4, 6 ; SI-NEXT: v_writelane_b32 v62, s5, 7 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v22, v24 ; SI-NEXT: v_writelane_b32 v62, s4, 8 ; SI-NEXT: v_writelane_b32 v62, s5, 9 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v7, v37 ; SI-NEXT: v_writelane_b32 v62, s4, 10 ; SI-NEXT: v_writelane_b32 v62, s5, 11 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: v_writelane_b32 v62, s4, 12 ; SI-NEXT: v_writelane_b32 v62, s5, 13 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: v_writelane_b32 v62, s4, 14 ; SI-NEXT: v_writelane_b32 v62, s5, 15 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v54, v59 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr21 +; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: v_mov_b32_e32 v44, v1 ; SI-NEXT: v_writelane_b32 v62, s4, 16 -; SI-NEXT: v_mov_b32_e32 v59, v51 -; SI-NEXT: v_mov_b32_e32 v31, v46 -; SI-NEXT: v_mov_b32_e32 v34, v22 -; SI-NEXT: v_mov_b32_e32 v22, v24 -; SI-NEXT: v_mov_b32_e32 v7, v37 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: v_writelane_b32 v62, s5, 17 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_mov_b32_e32 v52, v17 ; SI-NEXT: v_writelane_b32 v62, s4, 18 ; SI-NEXT: v_writelane_b32 v62, s5, 19 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v43, v20 ; SI-NEXT: v_writelane_b32 v62, s4, 20 ; SI-NEXT: v_writelane_b32 v62, s5, 21 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v42, v32 ; SI-NEXT: v_writelane_b32 v62, s4, 22 ; SI-NEXT: v_writelane_b32 v62, s5, 23 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v41, v5 ; SI-NEXT: v_writelane_b32 v62, s4, 24 ; SI-NEXT: v_writelane_b32 v62, s5, 25 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: s_mov_b64 vcc, -1 ; SI-NEXT: v_writelane_b32 v62, s4, 26 ; SI-NEXT: v_writelane_b32 v62, s5, 27 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v60, v35 ; SI-NEXT: v_writelane_b32 v62, s4, 28 ; SI-NEXT: v_writelane_b32 v62, s5, 29 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: v_mov_b32_e32 v35, v6 ; SI-NEXT: v_writelane_b32 v62, s4, 30 ; SI-NEXT: v_writelane_b32 v62, s5, 31 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v32, v4 ; SI-NEXT: v_writelane_b32 v62, s4, 32 ; SI-NEXT: v_writelane_b32 v62, s5, 33 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v30, v12 ; SI-NEXT: v_writelane_b32 v62, s4, 34 ; SI-NEXT: v_writelane_b32 v62, s5, 35 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v19, v39 ; SI-NEXT: v_writelane_b32 v62, s4, 36 ; SI-NEXT: v_writelane_b32 v62, s5, 37 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_mov_b32_e32 v39, v25 ; SI-NEXT: v_writelane_b32 v62, s4, 38 ; SI-NEXT: v_writelane_b32 v62, s5, 39 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v12, v29 ; SI-NEXT: v_writelane_b32 v62, s4, 40 ; SI-NEXT: v_writelane_b32 v62, s5, 41 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v20, v2 ; SI-NEXT: v_writelane_b32 v62, s4, 42 ; SI-NEXT: v_writelane_b32 v62, s5, 43 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v6, v55 ; SI-NEXT: v_writelane_b32 v62, s4, 44 ; SI-NEXT: v_writelane_b32 v62, s5, 45 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v17, v8 ; SI-NEXT: v_writelane_b32 v62, s4, 46 ; SI-NEXT: v_writelane_b32 v62, s5, 47 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v29, v33 ; SI-NEXT: v_writelane_b32 v62, s4, 48 ; SI-NEXT: v_writelane_b32 v62, s5, 49 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr96 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $sgpr86 +; SI-NEXT: ; implicit-def: $sgpr84 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr98 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr37 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr82 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr16 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr19 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: v_writelane_b32 v62, s4, 50 ; SI-NEXT: v_writelane_b32 v62, s5, 51 ; SI-NEXT: ; implicit-def: $sgpr4 @@ -164916,6 +165015,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_writelane_b32 v62, s5, 61 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: v_writelane_b32 v62, s4, 62 +; SI-NEXT: v_writelane_b32 v62, s5, 63 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: v_writelane_b32 v61, s4, 0 ; SI-NEXT: v_writelane_b32 v61, s5, 1 @@ -164955,118 +165055,18 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: v_writelane_b32 v61, s4, 24 ; SI-NEXT: v_writelane_b32 v61, s5, 25 -; SI-NEXT: v_writelane_b32 v61, s4, 26 -; SI-NEXT: v_writelane_b32 v62, s5, 63 ; SI-NEXT: ; implicit-def: $sgpr5 +; SI-NEXT: v_writelane_b32 v61, s4, 26 ; SI-NEXT: v_writelane_b32 v61, s5, 27 -; SI-NEXT: ; implicit-def: $sgpr20 ; SI-NEXT: v_writelane_b32 v61, s20, 28 -; SI-NEXT: ; implicit-def: $sgpr21 ; SI-NEXT: v_writelane_b32 v61, s21, 29 ; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: v_writelane_b32 v61, s20, 30 ; SI-NEXT: v_writelane_b32 v61, s21, 31 -; SI-NEXT: v_mov_b32_e32 v44, v1 -; SI-NEXT: ; implicit-def: $sgpr88 ; SI-NEXT: v_writelane_b32 v61, s88, 32 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: v_mov_b32_e32 v52, v17 -; SI-NEXT: v_mov_b32_e32 v43, v20 -; SI-NEXT: v_mov_b32_e32 v42, v32 -; SI-NEXT: v_mov_b32_e32 v41, v5 -; SI-NEXT: s_mov_b64 vcc, -1 -; SI-NEXT: ; implicit-def: $sgpr89 ; SI-NEXT: v_writelane_b32 v61, s89, 33 -; SI-NEXT: v_mov_b32_e32 v60, v35 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: v_mov_b32_e32 v35, v6 -; SI-NEXT: v_mov_b32_e32 v32, v4 -; SI-NEXT: v_mov_b32_e32 v30, v12 -; SI-NEXT: v_mov_b32_e32 v19, v39 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: v_mov_b32_e32 v39, v25 -; SI-NEXT: v_mov_b32_e32 v12, v29 -; SI-NEXT: v_mov_b32_e32 v20, v2 -; SI-NEXT: v_mov_b32_e32 v6, v55 -; SI-NEXT: v_mov_b32_e32 v17, v8 -; SI-NEXT: v_mov_b32_e32 v29, v33 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr96 -; SI-NEXT: ; implicit-def: $sgpr78 -; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $sgpr86 -; SI-NEXT: ; implicit-def: $sgpr84 -; SI-NEXT: ; implicit-def: $sgpr61 -; SI-NEXT: ; implicit-def: $sgpr65 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $sgpr80 -; SI-NEXT: ; implicit-def: $sgpr70 -; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $sgpr69 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $sgpr66 -; SI-NEXT: ; implicit-def: $sgpr38 -; SI-NEXT: ; implicit-def: $sgpr75 -; SI-NEXT: ; implicit-def: $sgpr91 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $sgpr52 -; SI-NEXT: ; implicit-def: $sgpr98 -; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr37 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $sgpr30 -; SI-NEXT: ; implicit-def: $sgpr82 -; SI-NEXT: ; implicit-def: $sgpr43 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $sgpr27 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $sgpr28 -; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $sgpr22 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $sgpr18 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $sgpr15 -; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; implicit-def: $sgpr94 -; SI-NEXT: ; implicit-def: $sgpr57 -; SI-NEXT: ; implicit-def: $sgpr92 -; SI-NEXT: ; implicit-def: $sgpr64 -; SI-NEXT: ; implicit-def: $sgpr79 -; SI-NEXT: ; implicit-def: $sgpr76 -; SI-NEXT: ; implicit-def: $sgpr90 -; SI-NEXT: ; implicit-def: $sgpr73 -; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; implicit-def: $sgpr59 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr68 -; SI-NEXT: ; implicit-def: $sgpr45 -; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $sgpr29 -; SI-NEXT: ; implicit-def: $sgpr24 -; SI-NEXT: ; implicit-def: $sgpr34 -; SI-NEXT: ; implicit-def: $sgpr23 -; SI-NEXT: ; implicit-def: $sgpr16 -; SI-NEXT: ; implicit-def: $sgpr36 -; SI-NEXT: ; implicit-def: $sgpr19 -; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr88 -; SI-NEXT: ; implicit-def: $sgpr13 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: .LBB91_3: ; %Flow ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -165091,29 +165091,30 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v53 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v38 ; SI-NEXT: v_readfirstlane_b32 s6, v9 ; SI-NEXT: v_readfirstlane_b32 s8, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v40 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: s_lshr_b32 s9, s6, 16 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; SI-NEXT: s_lshr_b64 s[10:11], s[8:9], 16 ; SI-NEXT: v_readfirstlane_b32 s6, v8 +; SI-NEXT: s_lshr_b64 s[10:11], s[8:9], 16 +; SI-NEXT: s_lshr_b32 s9, s6, 16 ; SI-NEXT: v_readfirstlane_b32 s8, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v54 -; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: s_mov_b32 s7, s9 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v58 +; SI-NEXT: v_writelane_b32 v61, s6, 26 ; SI-NEXT: s_lshr_b64 s[20:21], s[8:9], 16 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_readfirstlane_b32 s8, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v49 -; SI-NEXT: v_writelane_b32 v61, s6, 26 +; SI-NEXT: v_writelane_b32 v61, s7, 27 ; SI-NEXT: v_readfirstlane_b32 s6, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50 -; SI-NEXT: s_mov_b32 s7, s9 ; SI-NEXT: s_lshr_b32 s9, s6, 16 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_readfirstlane_b32 s18, v3 @@ -165134,7 +165135,6 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_writelane_b32 v61, s7, 27 ; SI-NEXT: s_lshr_b64 s[26:27], s[18:19], 16 ; SI-NEXT: s_mov_b32 s17, s26 ; SI-NEXT: s_mov_b32 s11, s20 @@ -165585,8 +165585,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: .LBB91_5: ; %end ; SI-NEXT: s_and_b32 s5, s8, 0xff ; SI-NEXT: v_readlane_b32 s8, v62, 0 -; SI-NEXT: s_lshl_b32 s8, s8, 8 ; SI-NEXT: v_readlane_b32 s9, v62, 1 +; SI-NEXT: s_lshl_b32 s8, s8, 8 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: v_readlane_b32 s8, v62, 2 ; SI-NEXT: v_readlane_b32 s9, v62, 3 @@ -165613,10 +165613,10 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_readlane_b32 vcc_hi, v62, 5 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: s_and_b32 s5, s86, 0xff +; SI-NEXT: v_readlane_b32 s9, v62, 7 ; SI-NEXT: s_lshl_b32 s8, s8, 8 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: v_readlane_b32 s9, v62, 7 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: v_readlane_b32 s8, v62, 8 ; SI-NEXT: v_readlane_b32 s9, v62, 9 @@ -165644,8 +165644,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_readlane_b32 s8, v62, 12 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: s_and_b32 s5, s80, 0xff -; SI-NEXT: s_lshl_b32 s8, s8, 8 ; SI-NEXT: v_readlane_b32 s9, v62, 13 +; SI-NEXT: s_lshl_b32 s8, s8, 8 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: v_readlane_b32 s8, v62, 14 ; SI-NEXT: v_readlane_b32 s9, v62, 15 @@ -165674,8 +165674,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_readlane_b32 s8, v62, 18 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: s_and_b32 s5, s66, 0xff -; SI-NEXT: s_lshl_b32 s8, s8, 8 ; SI-NEXT: v_readlane_b32 s9, v62, 19 +; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: v_readlane_b32 s61, v62, 17 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: v_readlane_b32 s8, v62, 20 ; SI-NEXT: v_readlane_b32 s9, v62, 21 @@ -165704,8 +165705,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_readlane_b32 s8, v62, 24 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: s_and_b32 s5, s52, 0xff -; SI-NEXT: s_lshl_b32 s8, s8, 8 ; SI-NEXT: v_readlane_b32 s9, v62, 25 +; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: v_readlane_b32 s61, v62, 23 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: v_readlane_b32 s8, v62, 26 ; SI-NEXT: v_readlane_b32 s9, v62, 27 @@ -165734,8 +165736,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_readlane_b32 s8, v62, 30 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: s_and_b32 s5, s30, 0xff -; SI-NEXT: s_lshl_b32 s8, s8, 8 ; SI-NEXT: v_readlane_b32 s9, v62, 31 +; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: v_readlane_b32 s61, v62, 29 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: v_readlane_b32 s8, v62, 32 ; SI-NEXT: v_readlane_b32 s9, v62, 33 @@ -165764,8 +165767,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_readlane_b32 s8, v62, 36 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: s_and_b32 s5, s50, 0xff -; SI-NEXT: s_lshl_b32 s8, s8, 8 ; SI-NEXT: v_readlane_b32 s9, v62, 37 +; SI-NEXT: s_lshl_b32 s8, s8, 8 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: v_readlane_b32 s8, v62, 38 ; SI-NEXT: v_readlane_b32 s9, v62, 39 @@ -165794,8 +165797,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_readlane_b32 s8, v62, 42 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: s_and_b32 s5, s92, 0xff -; SI-NEXT: s_lshl_b32 s8, s8, 8 ; SI-NEXT: v_readlane_b32 s9, v62, 43 +; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: v_readlane_b32 s43, v62, 41 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: v_readlane_b32 s8, v62, 44 ; SI-NEXT: v_readlane_b32 s9, v62, 45 @@ -165824,8 +165828,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_readlane_b32 s8, v62, 48 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: s_and_b32 s5, s76, 0xff -; SI-NEXT: s_lshl_b32 s8, s8, 8 ; SI-NEXT: v_readlane_b32 s9, v62, 49 +; SI-NEXT: s_lshl_b32 s8, s8, 8 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: v_readlane_b32 s8, v62, 50 ; SI-NEXT: v_readlane_b32 s9, v62, 51 @@ -165854,8 +165858,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_readlane_b32 s8, v62, 54 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: s_and_b32 s5, s62, 0xff -; SI-NEXT: s_lshl_b32 s8, s8, 8 ; SI-NEXT: v_readlane_b32 s9, v62, 55 +; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: v_readlane_b32 s27, v62, 53 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: v_readlane_b32 s8, v62, 56 ; SI-NEXT: v_readlane_b32 s9, v62, 57 @@ -165884,8 +165889,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_readlane_b32 s8, v62, 60 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: s_and_b32 s5, s46, 0xff -; SI-NEXT: s_lshl_b32 s8, s8, 8 ; SI-NEXT: v_readlane_b32 s9, v62, 61 +; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: v_readlane_b32 s27, v62, 59 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: v_readlane_b32 s8, v62, 62 ; SI-NEXT: v_readlane_b32 s9, v62, 63 @@ -165914,8 +165920,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_readlane_b32 s8, v61, 2 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: s_and_b32 s5, s40, 0xff -; SI-NEXT: s_lshl_b32 s8, s8, 8 ; SI-NEXT: v_readlane_b32 s9, v61, 3 +; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: v_readlane_b32 s27, v61, 1 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: v_readlane_b32 s8, v61, 4 ; SI-NEXT: v_readlane_b32 s9, v61, 5 @@ -165944,8 +165951,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_readlane_b32 s8, v61, 8 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: s_and_b32 s5, s24, 0xff -; SI-NEXT: s_lshl_b32 s8, s8, 8 ; SI-NEXT: v_readlane_b32 s9, v61, 9 +; SI-NEXT: s_lshl_b32 s8, s8, 8 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: v_readlane_b32 s8, v61, 10 ; SI-NEXT: v_readlane_b32 s9, v61, 11 @@ -165974,8 +165981,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_readlane_b32 s8, v61, 14 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: s_and_b32 s5, s16, 0xff -; SI-NEXT: s_lshl_b32 s8, s8, 8 ; SI-NEXT: v_readlane_b32 s9, v61, 15 +; SI-NEXT: s_lshl_b32 s8, s8, 8 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: v_readlane_b32 s8, v61, 16 ; SI-NEXT: v_readlane_b32 s9, v61, 17 @@ -166000,24 +166007,14 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_and_b32 s8, s19, 0xff ; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: v_readlane_b32 s9, v61, 21 -; SI-NEXT: v_readlane_b32 s9, v61, 23 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: v_readlane_b32 s61, v62, 17 -; SI-NEXT: v_readlane_b32 s27, v62, 53 -; SI-NEXT: v_readlane_b32 s61, v62, 23 -; SI-NEXT: v_readlane_b32 s27, v62, 59 -; SI-NEXT: v_readlane_b32 s61, v62, 29 -; SI-NEXT: v_readlane_b32 s43, v62, 41 -; SI-NEXT: v_readlane_b32 s27, v61, 1 ; SI-NEXT: v_readlane_b32 s61, v62, 35 ; SI-NEXT: v_readlane_b32 s43, v62, 47 ; SI-NEXT: v_readlane_b32 s27, v61, 7 ; SI-NEXT: v_readlane_b32 s21, v61, 13 ; SI-NEXT: v_readlane_b32 s17, v61, 19 -; SI-NEXT: v_readlane_b32 s11, v61, 25 ; SI-NEXT: v_readlane_b32 s99, v63, 35 ; SI-NEXT: v_readlane_b32 s98, v63, 34 ; SI-NEXT: v_readlane_b32 s97, v63, 33 @@ -166060,9 +166057,11 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_readlane_b32 s8, v61, 20 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: s_and_b32 s5, s10, 0xff +; SI-NEXT: v_readlane_b32 s9, v61, 21 ; SI-NEXT: s_lshl_b32 s8, s8, 8 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: v_readlane_b32 s8, v61, 22 +; SI-NEXT: v_readlane_b32 s9, v61, 23 ; SI-NEXT: s_and_b32 s8, s8, 0xff ; SI-NEXT: v_readlane_b32 s10, v61, 24 ; SI-NEXT: s_lshl_b32 s8, s8, 16 @@ -166086,16 +166085,16 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 -; SI-NEXT: v_readlane_b32 s9, v61, 29 -; SI-NEXT: v_readlane_b32 s9, v61, 31 -; SI-NEXT: v_readlane_b32 s9, v61, 33 +; SI-NEXT: v_readlane_b32 s11, v61, 25 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; SI-NEXT: v_or_b32_e32 v1, s8, v1 ; SI-NEXT: v_readlane_b32 s8, v61, 28 +; SI-NEXT: v_readlane_b32 s9, v61, 29 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: s_lshl_b32 s5, s8, 8 ; SI-NEXT: v_readlane_b32 s8, v61, 30 +; SI-NEXT: v_readlane_b32 s9, v61, 31 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s8, 0xff ; SI-NEXT: v_readlane_b32 s8, v61, 32 @@ -166117,6 +166116,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: v_readlane_b32 s9, v61, 33 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 @@ -167707,6 +167707,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: s_and_b32 s5, s6, 0xff ; VI-NEXT: v_readlane_b32 s6, v22, 49 +; VI-NEXT: v_readlane_b32 s9, v22, 5 ; VI-NEXT: s_lshl_b32 s6, s6, 8 ; VI-NEXT: s_or_b32 s5, s5, s6 ; VI-NEXT: v_readlane_b32 s6, v22, 48 @@ -167763,7 +167764,6 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x78, v0 ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_readlane_b32 s9, v22, 5 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 ; VI-NEXT: v_mov_b32_e32 v1, s4 @@ -167814,41 +167814,42 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: .LBB91_4: ; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr61 ; VI-NEXT: ; implicit-def: $sgpr60 +; VI-NEXT: ; implicit-def: $sgpr63 +; VI-NEXT: ; implicit-def: $sgpr62 +; VI-NEXT: ; implicit-def: $sgpr73 +; VI-NEXT: ; implicit-def: $sgpr72 +; VI-NEXT: ; implicit-def: $sgpr75 +; VI-NEXT: ; implicit-def: $sgpr74 +; VI-NEXT: ; implicit-def: $sgpr77 +; VI-NEXT: ; implicit-def: $sgpr76 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 ; VI-NEXT: v_writelane_b32 v22, s60, 0 ; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr61 ; VI-NEXT: v_writelane_b32 v22, s61, 1 ; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr62 ; VI-NEXT: v_writelane_b32 v22, s62, 2 ; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr63 ; VI-NEXT: v_writelane_b32 v22, s63, 3 ; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr72 ; VI-NEXT: v_writelane_b32 v22, s72, 4 ; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr73 ; VI-NEXT: v_writelane_b32 v22, s73, 5 ; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr74 ; VI-NEXT: v_writelane_b32 v22, s74, 6 ; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr75 ; VI-NEXT: v_writelane_b32 v22, s75, 7 ; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr76 ; VI-NEXT: v_writelane_b32 v22, s76, 8 ; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; kill: killed $sgpr46 @@ -167862,7 +167863,6 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: ; implicit-def: $sgpr47 ; VI-NEXT: ; implicit-def: $sgpr65 ; VI-NEXT: ; implicit-def: $sgpr53 -; VI-NEXT: ; implicit-def: $sgpr77 ; VI-NEXT: ; implicit-def: $sgpr59 ; VI-NEXT: ; implicit-def: $sgpr66 ; VI-NEXT: ; implicit-def: $sgpr78 @@ -169890,11 +169890,11 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: .LBB91_2: ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr43 ; GFX11-NEXT: ; implicit-def: $sgpr46 ; GFX11-NEXT: ; kill: killed $sgpr46 -; GFX11-NEXT: ; implicit-def: $sgpr43 -; GFX11-NEXT: ; implicit-def: $vcc_lo ; GFX11-NEXT: ; implicit-def: $vcc_hi +; GFX11-NEXT: ; implicit-def: $vcc_lo ; GFX11-NEXT: s_mov_b32 s104, -1 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 @@ -169997,6 +169997,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: v_writelane_b32 v20, s42, 0 +; GFX11-NEXT: v_writelane_b32 v20, s43, 1 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 @@ -170006,15 +170007,14 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: v_writelane_b32 v20, s43, 1 +; GFX11-NEXT: v_writelane_b32 v20, s46, 2 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: v_writelane_b32 v20, s46, 2 -; GFX11-NEXT: ; implicit-def: $sgpr46 ; GFX11-NEXT: v_writelane_b32 v20, s47, 3 +; GFX11-NEXT: ; implicit-def: $sgpr46 ; GFX11-NEXT: v_writelane_b32 v20, vcc_lo, 4 -; GFX11-NEXT: ; implicit-def: $vcc_lo ; GFX11-NEXT: v_writelane_b32 v20, vcc_hi, 5 +; GFX11-NEXT: ; implicit-def: $vcc_lo ; GFX11-NEXT: v_writelane_b32 v20, vcc_lo, 6 ; GFX11-NEXT: v_writelane_b32 v20, vcc_hi, 7 ; GFX11-NEXT: .LBB91_3: ; %Flow @@ -170854,82 +170854,82 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: s_lshr_b64 s[74:75], s[74:75], 24 ; GFX11-NEXT: s_lshr_b32 s75, s42, 8 ; GFX11-NEXT: v_writelane_b32 v20, s58, 0 -; GFX11-NEXT: s_lshr_b32 s58, s63, 24 ; GFX11-NEXT: s_lshr_b32 s26, s26, 16 ; GFX11-NEXT: s_lshr_b32 s65, s73, 24 ; GFX11-NEXT: s_pack_ll_b32_b16 s90, s26, s90 +; GFX11-NEXT: s_lshr_b32 s82, s73, 8 ; GFX11-NEXT: v_writelane_b32 v20, s59, 1 +; GFX11-NEXT: s_lshr_b32 s58, s63, 24 ; GFX11-NEXT: s_lshr_b32 s59, s63, 8 ; GFX11-NEXT: s_lshr_b64 s[62:63], s[62:63], 24 ; GFX11-NEXT: s_lshr_b32 s63, s93, 24 -; GFX11-NEXT: s_lshr_b32 s82, s73, 8 +; GFX11-NEXT: s_lshr_b32 s84, s72, 16 ; GFX11-NEXT: v_writelane_b32 v20, s63, 21 ; GFX11-NEXT: s_lshr_b32 s63, s93, 8 -; GFX11-NEXT: s_lshr_b32 s84, s72, 16 ; GFX11-NEXT: s_lshr_b32 s51, s72, 8 ; GFX11-NEXT: s_lshr_b64 s[72:73], s[72:73], 24 +; GFX11-NEXT: s_lshr_b32 s86, s77, 24 ; GFX11-NEXT: v_writelane_b32 v20, s63, 22 ; GFX11-NEXT: s_lshr_b32 s63, s92, 16 -; GFX11-NEXT: s_lshr_b32 s86, s77, 24 ; GFX11-NEXT: s_lshr_b32 s87, s77, 8 ; GFX11-NEXT: s_lshr_b32 s52, s76, 16 +; GFX11-NEXT: s_lshr_b32 s100, s76, 8 ; GFX11-NEXT: v_writelane_b32 v20, s63, 23 ; GFX11-NEXT: s_lshr_b32 s63, s92, 8 -; GFX11-NEXT: s_lshr_b32 s100, s76, 8 ; GFX11-NEXT: s_lshr_b64 s[76:77], s[76:77], 24 ; GFX11-NEXT: s_lshr_b32 s101, s89, 8 +; GFX11-NEXT: s_lshr_b32 s98, s79, 24 ; GFX11-NEXT: v_writelane_b32 v20, s63, 24 ; GFX11-NEXT: s_lshr_b32 s63, s95, 24 -; GFX11-NEXT: s_lshr_b32 s98, s79, 24 ; GFX11-NEXT: s_lshr_b32 s99, s79, 8 ; GFX11-NEXT: s_lshr_b32 s53, s78, 16 +; GFX11-NEXT: s_lshr_b32 s97, s78, 8 ; GFX11-NEXT: v_writelane_b32 v20, s63, 25 ; GFX11-NEXT: s_lshr_b32 s63, s95, 8 -; GFX11-NEXT: s_lshr_b32 s97, s78, 8 ; GFX11-NEXT: s_lshr_b64 s[78:79], s[78:79], 24 ; GFX11-NEXT: s_lshr_b64 s[92:93], s[92:93], 24 +; GFX11-NEXT: s_lshr_b32 s102, s94, 16 ; GFX11-NEXT: v_writelane_b32 v20, s63, 26 ; GFX11-NEXT: s_lshr_b32 s63, s43, 24 -; GFX11-NEXT: s_lshr_b32 s102, s94, 16 ; GFX11-NEXT: s_lshr_b32 s103, s94, 8 ; GFX11-NEXT: s_lshr_b64 s[94:95], s[94:95], 24 +; GFX11-NEXT: s_lshr_b32 s73, s91, 24 ; GFX11-NEXT: v_writelane_b32 v20, s63, 27 ; GFX11-NEXT: s_lshr_b32 s63, s43, 8 -; GFX11-NEXT: s_lshr_b32 s73, s91, 24 ; GFX11-NEXT: s_lshr_b32 s77, s91, 8 ; GFX11-NEXT: s_lshr_b32 s83, s90, 8 +; GFX11-NEXT: s_lshr_b32 s66, s37, 24 ; GFX11-NEXT: v_writelane_b32 v20, s63, 28 ; GFX11-NEXT: s_lshr_b32 s63, s42, 16 ; GFX11-NEXT: s_lshr_b64 s[42:43], s[42:43], 24 -; GFX11-NEXT: s_lshr_b32 s66, s37, 24 ; GFX11-NEXT: s_lshr_b32 s67, s37, 8 -; GFX11-NEXT: v_writelane_b32 v20, s42, 6 ; GFX11-NEXT: s_lshr_b32 s68, s36, 16 +; GFX11-NEXT: v_writelane_b32 v20, s42, 6 ; GFX11-NEXT: s_lshr_b32 s49, s36, 8 ; GFX11-NEXT: s_lshr_b32 s69, s35, 24 ; GFX11-NEXT: s_lshr_b32 s70, s35, 8 +; GFX11-NEXT: s_lshr_b32 s64, s34, 16 ; GFX11-NEXT: v_writelane_b32 v20, s43, 7 ; GFX11-NEXT: s_lshr_b64 s[42:43], s[44:45], 24 -; GFX11-NEXT: s_lshr_b32 s64, s34, 16 ; GFX11-NEXT: s_lshr_b32 s80, s34, 8 ; GFX11-NEXT: s_lshr_b32 s79, s45, 24 -; GFX11-NEXT: v_writelane_b32 v20, s42, 4 -; GFX11-NEXT: s_lshr_b32 s42, s89, 24 ; GFX11-NEXT: s_lshr_b32 s93, s45, 8 +; GFX11-NEXT: v_writelane_b32 v20, s42, 4 ; GFX11-NEXT: s_lshr_b32 s95, s44, 16 ; GFX11-NEXT: s_lshr_b32 vcc_hi, s44, 8 -; GFX11-NEXT: v_writelane_b32 v20, s43, 5 -; GFX11-NEXT: s_lshr_b32 s43, s88, 16 ; GFX11-NEXT: s_lshr_b32 s34, s47, 24 ; GFX11-NEXT: s_lshr_b32 s55, s47, 8 +; GFX11-NEXT: v_writelane_b32 v20, s43, 5 +; GFX11-NEXT: s_lshr_b32 s43, s88, 16 +; GFX11-NEXT: s_lshr_b32 s42, s89, 24 ; GFX11-NEXT: s_lshr_b32 s35, s46, 16 +; GFX11-NEXT: s_lshr_b32 s36, s46, 8 ; GFX11-NEXT: v_writelane_b32 v20, s43, 29 ; GFX11-NEXT: s_lshr_b32 s43, s88, 8 ; GFX11-NEXT: s_lshr_b64 s[88:89], s[88:89], 24 ; GFX11-NEXT: s_lshr_b32 s89, s90, 16 ; GFX11-NEXT: s_lshr_b64 s[90:91], s[90:91], 24 ; GFX11-NEXT: v_writelane_b32 v20, s43, 30 -; GFX11-NEXT: s_lshr_b32 s36, s46, 8 ; GFX11-NEXT: s_lshr_b64 s[46:47], s[46:47], 24 ; GFX11-NEXT: s_lshr_b32 s37, s57, 24 ; GFX11-NEXT: s_lshr_b32 s38, s57, 8 @@ -171042,9 +171042,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: s_and_b32 s16, s16, 0xff ; GFX11-NEXT: s_lshl_b32 s19, s73, 8 ; GFX11-NEXT: s_or_b32 s16, s16, s17 -; GFX11-NEXT: v_readlane_b32 s17, v20, 1 +; GFX11-NEXT: v_readlane_b32 s96, v19, 0 ; GFX11-NEXT: s_lshl_b32 s16, s16, 16 -; GFX11-NEXT: s_lshl_b32 s17, s70, 8 +; GFX11-NEXT: v_readlane_b32 s81, v18, 25 ; GFX11-NEXT: s_or_b32 s3, s3, s16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 @@ -171056,14 +171056,14 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: v_readlane_b32 s16, v20, 0 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_and_b32 s1, s21, 0xff -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off -; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-NEXT: v_readlane_b32 s17, v20, 1 ; GFX11-NEXT: s_or_b32 s1, s1, s2 ; GFX11-NEXT: v_readlane_b32 s2, v20, 18 ; GFX11-NEXT: s_and_b32 s1, s1, 0xffff -; GFX11-NEXT: v_readlane_b32 s96, v19, 0 -; GFX11-NEXT: v_readlane_b32 s81, v18, 25 +; GFX11-NEXT: s_lshl_b32 s17, s70, 8 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 ; GFX11-NEXT: v_readlane_b32 s70, v18, 22 ; GFX11-NEXT: s_and_b32 s2, s2, 0xff ; GFX11-NEXT: v_readlane_b32 s69, v18, 21 @@ -171092,21 +171092,22 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: s_or_b32 s17, s17, s18 ; GFX11-NEXT: s_and_b32 s0, s0, 0xffff ; GFX11-NEXT: s_lshl_b32 s16, s17, 16 -; GFX11-NEXT: v_readlane_b32 s17, v20, 3 +; GFX11-NEXT: s_and_b32 s18, s71, 0xff ; GFX11-NEXT: s_or_b32 s3, s3, s16 ; GFX11-NEXT: v_readlane_b32 s16, v20, 2 ; GFX11-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 ; GFX11-NEXT: s_and_b32 s2, s68, 0xff -; GFX11-NEXT: s_lshl_b32 s17, s66, 8 +; GFX11-NEXT: v_readlane_b32 s17, v20, 3 ; GFX11-NEXT: s_lshl_b32 s3, s16, 8 ; GFX11-NEXT: v_readlane_b32 s16, v20, 20 ; GFX11-NEXT: s_or_b32 s1, s2, s3 ; GFX11-NEXT: s_and_b32 s2, s25, 0xff ; GFX11-NEXT: s_lshl_b32 s3, s67, 8 -; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_lshl_b32 s17, s66, 8 ; GFX11-NEXT: s_and_b32 s16, s16, 0xff ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s16, s17 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 ; GFX11-NEXT: s_or_b32 s0, s0, s1 @@ -171119,10 +171120,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: s_or_b32 s3, s16, s17 ; GFX11-NEXT: s_and_b32 s16, s27, 0xff ; GFX11-NEXT: s_lshl_b32 s17, s77, 8 -; GFX11-NEXT: s_and_b32 s18, s71, 0xff +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-NEXT: s_or_b32 s16, s16, s17 ; GFX11-NEXT: s_or_b32 s17, s18, s19 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 ; GFX11-NEXT: s_and_b32 s16, s16, 0xffff ; GFX11-NEXT: s_lshl_b32 s17, s17, 16 @@ -188515,8 +188515,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_writelane_b32 v62, s5, 3 ; SI-NEXT: s_lshr_b64 s[4:5], s[44:45], 8 ; SI-NEXT: v_writelane_b32 v62, s4, 0 -; SI-NEXT: v_readfirstlane_b32 s4, v38 ; SI-NEXT: v_writelane_b32 v62, s5, 1 +; SI-NEXT: v_readfirstlane_b32 s4, v38 ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: v_readfirstlane_b32 s5, v37 ; SI-NEXT: s_or_b32 s42, s5, s4 @@ -188550,8 +188550,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_writelane_b32 v62, s5, 15 ; SI-NEXT: s_lshr_b64 s[4:5], s[40:41], 8 ; SI-NEXT: v_writelane_b32 v62, s4, 12 -; SI-NEXT: v_readfirstlane_b32 s4, v35 ; SI-NEXT: v_writelane_b32 v62, s5, 13 +; SI-NEXT: v_readfirstlane_b32 s4, v35 ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: v_readfirstlane_b32 s5, v20 ; SI-NEXT: s_or_b32 s28, s5, s4 @@ -188631,8 +188631,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_writelane_b32 v62, s5, 33 ; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 8 ; SI-NEXT: v_writelane_b32 v62, s4, 30 -; SI-NEXT: v_readfirstlane_b32 s4, v7 ; SI-NEXT: v_writelane_b32 v62, s5, 31 +; SI-NEXT: v_readfirstlane_b32 s4, v7 ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: v_readfirstlane_b32 s5, v29 ; SI-NEXT: s_or_b32 s22, s5, s4 @@ -188648,8 +188648,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_writelane_b32 v62, s5, 39 ; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 8 ; SI-NEXT: v_writelane_b32 v62, s4, 36 -; SI-NEXT: v_readfirstlane_b32 s4, v58 ; SI-NEXT: v_writelane_b32 v62, s5, 37 +; SI-NEXT: v_readfirstlane_b32 s4, v58 ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: v_readfirstlane_b32 s5, v13 ; SI-NEXT: s_or_b32 s20, s5, s4 @@ -188814,104 +188814,97 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: v_mov_b32_e32 v51, v42 ; SI-NEXT: v_writelane_b32 v62, s4, 0 ; SI-NEXT: v_writelane_b32 v62, s5, 1 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v41, v21 ; SI-NEXT: v_writelane_b32 v62, s4, 2 ; SI-NEXT: v_writelane_b32 v62, s5, 3 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v21, v24 ; SI-NEXT: v_writelane_b32 v62, s4, 4 ; SI-NEXT: v_writelane_b32 v62, s5, 5 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v40, v34 ; SI-NEXT: v_writelane_b32 v62, s4, 6 ; SI-NEXT: v_writelane_b32 v62, s5, 7 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v34, v61 ; SI-NEXT: v_writelane_b32 v62, s4, 8 ; SI-NEXT: v_writelane_b32 v62, s5, 9 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v13, v12 ; SI-NEXT: v_writelane_b32 v62, s4, 10 ; SI-NEXT: v_writelane_b32 v62, s5, 11 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v12, v48 ; SI-NEXT: v_writelane_b32 v62, s4, 12 ; SI-NEXT: v_writelane_b32 v62, s5, 13 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v57, v30 ; SI-NEXT: v_writelane_b32 v62, s4, 14 ; SI-NEXT: v_writelane_b32 v62, s5, 15 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v58, v11 ; SI-NEXT: v_writelane_b32 v62, s4, 16 ; SI-NEXT: v_writelane_b32 v62, s5, 17 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v56, v47 ; SI-NEXT: v_writelane_b32 v62, s4, 18 ; SI-NEXT: v_writelane_b32 v62, s5, 19 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v46, v33 ; SI-NEXT: v_writelane_b32 v62, s4, 20 ; SI-NEXT: v_writelane_b32 v62, s5, 21 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v30, v32 ; SI-NEXT: v_writelane_b32 v62, s4, 22 ; SI-NEXT: v_writelane_b32 v62, s5, 23 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v32, v31 ; SI-NEXT: v_writelane_b32 v62, s4, 24 ; SI-NEXT: v_writelane_b32 v62, s5, 25 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v31, v10 ; SI-NEXT: v_writelane_b32 v62, s4, 26 ; SI-NEXT: v_writelane_b32 v62, s5, 27 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v54, v9 ; SI-NEXT: v_writelane_b32 v62, s4, 28 ; SI-NEXT: v_writelane_b32 v62, s5, 29 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v55, v4 ; SI-NEXT: v_writelane_b32 v62, s4, 30 ; SI-NEXT: v_writelane_b32 v62, s5, 31 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v22, v2 ; SI-NEXT: v_writelane_b32 v62, s4, 32 ; SI-NEXT: v_writelane_b32 v62, s5, 33 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v2, v1 ; SI-NEXT: v_writelane_b32 v62, s4, 34 ; SI-NEXT: v_writelane_b32 v62, s5, 35 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v17, v43 ; SI-NEXT: v_writelane_b32 v62, s4, 36 ; SI-NEXT: v_writelane_b32 v62, s5, 37 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: s_mov_b64 vcc, -1 ; SI-NEXT: v_writelane_b32 v62, s4, 38 ; SI-NEXT: v_writelane_b32 v62, s5, 39 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v25, v59 ; SI-NEXT: v_writelane_b32 v62, s4, 40 ; SI-NEXT: v_writelane_b32 v62, s5, 41 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v1, v52 ; SI-NEXT: v_writelane_b32 v62, s4, 42 ; SI-NEXT: v_writelane_b32 v62, s5, 43 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v62, s4, 44 -; SI-NEXT: v_writelane_b32 v62, s5, 45 -; SI-NEXT: ; implicit-def: $sgpr80 -; SI-NEXT: v_writelane_b32 v62, s80, 46 -; SI-NEXT: v_writelane_b32 v62, s81, 47 -; SI-NEXT: ; implicit-def: $sgpr80 -; SI-NEXT: v_writelane_b32 v62, s80, 48 -; SI-NEXT: v_mov_b32_e32 v51, v42 -; SI-NEXT: v_mov_b32_e32 v41, v21 -; SI-NEXT: v_mov_b32_e32 v21, v24 -; SI-NEXT: v_mov_b32_e32 v40, v34 -; SI-NEXT: v_mov_b32_e32 v34, v61 -; SI-NEXT: v_mov_b32_e32 v13, v12 -; SI-NEXT: v_mov_b32_e32 v12, v48 -; SI-NEXT: v_mov_b32_e32 v57, v30 -; SI-NEXT: v_mov_b32_e32 v58, v11 -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_mov_b32_e32 v46, v33 -; SI-NEXT: v_mov_b32_e32 v30, v32 -; SI-NEXT: v_mov_b32_e32 v32, v31 -; SI-NEXT: v_mov_b32_e32 v31, v10 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v54, v9 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v55, v4 -; SI-NEXT: v_mov_b32_e32 v22, v2 -; SI-NEXT: v_mov_b32_e32 v2, v1 -; SI-NEXT: v_mov_b32_e32 v17, v43 -; SI-NEXT: s_mov_b64 vcc, -1 -; SI-NEXT: v_writelane_b32 v62, s81, 49 -; SI-NEXT: v_mov_b32_e32 v25, v59 -; SI-NEXT: v_mov_b32_e32 v1, v52 ; SI-NEXT: ; implicit-def: $sgpr44 ; SI-NEXT: ; implicit-def: $sgpr71 ; SI-NEXT: ; implicit-def: $vgpr24 @@ -188981,7 +188974,14 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: ; implicit-def: $sgpr86 ; SI-NEXT: ; implicit-def: $sgpr84 ; SI-NEXT: ; implicit-def: $sgpr82 +; SI-NEXT: v_writelane_b32 v62, s4, 44 +; SI-NEXT: v_writelane_b32 v62, s5, 45 +; SI-NEXT: v_writelane_b32 v62, s80, 46 +; SI-NEXT: v_writelane_b32 v62, s81, 47 +; SI-NEXT: ; implicit-def: $sgpr80 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v62, s80, 48 +; SI-NEXT: v_writelane_b32 v62, s81, 49 ; SI-NEXT: ; implicit-def: $sgpr80 ; SI-NEXT: .LBB95_3: ; %Flow ; SI-NEXT: v_mov_b32_e32 v14, v17 @@ -189515,9 +189515,11 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_lshr_b32 s71, s45, 8 ; SI-NEXT: .LBB95_5: ; %end ; SI-NEXT: v_readlane_b32 vcc_lo, v62, 0 +; SI-NEXT: v_readlane_b32 vcc_hi, v62, 1 ; SI-NEXT: s_lshl_b32 s47, vcc_lo, 8 -; SI-NEXT: s_and_b32 s44, s44, 0xff ; SI-NEXT: v_readlane_b32 vcc_lo, v62, 2 +; SI-NEXT: s_and_b32 s44, s44, 0xff +; SI-NEXT: v_readlane_b32 vcc_hi, v62, 3 ; SI-NEXT: s_or_b32 s44, s44, s47 ; SI-NEXT: s_and_b32 s47, vcc_lo, 0xff ; SI-NEXT: v_readlane_b32 vcc_lo, v62, 4 @@ -189538,9 +189540,10 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v13, v21, v13 ; SI-NEXT: v_or_b32_e32 v13, s44, v13 ; SI-NEXT: v_readlane_b32 s44, v62, 6 +; SI-NEXT: v_readlane_b32 s45, v62, 7 ; SI-NEXT: s_lshl_b32 s44, s44, 8 ; SI-NEXT: s_and_b32 s42, s42, 0xff -; SI-NEXT: v_readlane_b32 s45, v62, 7 +; SI-NEXT: v_readlane_b32 vcc_hi, v62, 5 ; SI-NEXT: s_or_b32 s42, s42, s44 ; SI-NEXT: v_readlane_b32 s44, v62, 8 ; SI-NEXT: v_readlane_b32 s45, v62, 9 @@ -189562,9 +189565,9 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v12, v23, v12 ; SI-NEXT: v_or_b32_e32 v12, s42, v12 ; SI-NEXT: v_readlane_b32 s42, v62, 12 +; SI-NEXT: v_readlane_b32 s43, v62, 13 ; SI-NEXT: s_lshl_b32 s42, s42, 8 ; SI-NEXT: s_and_b32 s40, s40, 0xff -; SI-NEXT: v_readlane_b32 s43, v62, 13 ; SI-NEXT: s_or_b32 s40, s40, s42 ; SI-NEXT: v_readlane_b32 s42, v62, 14 ; SI-NEXT: v_readlane_b32 s43, v62, 15 @@ -189586,9 +189589,9 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v11, v24, v11 ; SI-NEXT: v_or_b32_e32 v11, s40, v11 ; SI-NEXT: v_readlane_b32 s40, v62, 18 +; SI-NEXT: v_readlane_b32 s41, v62, 19 ; SI-NEXT: s_lshl_b32 s40, s40, 8 ; SI-NEXT: s_and_b32 s28, s28, 0xff -; SI-NEXT: v_readlane_b32 s41, v62, 19 ; SI-NEXT: s_or_b32 s28, s28, s40 ; SI-NEXT: v_readlane_b32 s40, v62, 20 ; SI-NEXT: v_readlane_b32 s41, v62, 21 @@ -189610,9 +189613,9 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: v_or_b32_e32 v25, s28, v25 ; SI-NEXT: v_readlane_b32 s28, v62, 24 +; SI-NEXT: v_readlane_b32 s29, v62, 25 ; SI-NEXT: s_lshl_b32 s28, s28, 8 ; SI-NEXT: s_and_b32 s26, s26, 0xff -; SI-NEXT: v_readlane_b32 s29, v62, 25 ; SI-NEXT: s_or_b32 s26, s26, s28 ; SI-NEXT: v_readlane_b32 s28, v62, 26 ; SI-NEXT: v_readlane_b32 s29, v62, 27 @@ -189634,17 +189637,14 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v27, v28, v27 ; SI-NEXT: v_or_b32_e32 v27, s26, v27 ; SI-NEXT: v_readlane_b32 s26, v62, 30 +; SI-NEXT: v_readlane_b32 s27, v62, 31 ; SI-NEXT: s_lshl_b32 s26, s26, 8 ; SI-NEXT: s_and_b32 s24, s24, 0xff -; SI-NEXT: v_readlane_b32 vcc_hi, v62, 1 -; SI-NEXT: v_readlane_b32 s27, v62, 31 ; SI-NEXT: s_or_b32 s24, s24, s26 ; SI-NEXT: v_readlane_b32 s26, v62, 32 -; SI-NEXT: v_readlane_b32 vcc_hi, v62, 3 ; SI-NEXT: v_readlane_b32 s27, v62, 33 ; SI-NEXT: s_and_b32 s26, s26, 0xff ; SI-NEXT: v_readlane_b32 s28, v62, 34 -; SI-NEXT: v_readlane_b32 vcc_hi, v62, 5 ; SI-NEXT: s_lshl_b32 s27, s28, 24 ; SI-NEXT: s_lshl_b32 s26, s26, 16 ; SI-NEXT: v_readlane_b32 vcc_hi, v62, 11 @@ -189678,11 +189678,11 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v11, vcc, 24, v0 +; SI-NEXT: v_readlane_b32 s25, v62, 37 ; SI-NEXT: s_lshl_b32 s24, s24, 8 ; SI-NEXT: s_and_b32 s22, s22, 0xff ; SI-NEXT: buffer_store_dword v24, v11, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v11, vcc, 28, v0 -; SI-NEXT: v_readlane_b32 s25, v62, 37 ; SI-NEXT: s_or_b32 s22, s22, s24 ; SI-NEXT: v_readlane_b32 s24, v62, 38 ; SI-NEXT: buffer_store_dword v25, v11, s[0:3], 0 offen @@ -193194,6 +193194,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: ; implicit-def: $sgpr74 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: v_writelane_b32 v78, s42, 0 +; GFX11-NEXT: v_writelane_b32 v78, s43, 1 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 @@ -193204,10 +193205,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: v_writelane_b32 v78, s43, 1 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr43 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 @@ -193270,6 +193269,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr43 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: s_branch .LBB95_2 ; GFX11-NEXT: .LBB95_4: @@ -193352,10 +193352,9 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: v_readlane_b32 s0, v78, 18 ; GFX11-NEXT: v_dual_mov_b32 v67, s58 :: v_dual_mov_b32 v26, s88 ; GFX11-NEXT: v_dual_mov_b32 v81, s44 :: v_dual_mov_b32 v30, s90 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_mov_b32_e32 v42, s0 ; GFX11-NEXT: v_readlane_b32 s0, v78, 19 -; GFX11-NEXT: v_readlane_b32 s1, v78, 1 ; GFX11-NEXT: v_dual_mov_b32 v17, s74 :: v_dual_mov_b32 v34, s92 ; GFX11-NEXT: v_dual_mov_b32 v65, s94 :: v_dual_mov_b32 v68, s30 ; GFX11-NEXT: v_mov_b32_e32 v41, s0 @@ -193415,9 +193414,10 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: v_readlane_b32 s0, v77, 7 ; GFX11-NEXT: v_mov_b32_e32 v149, s0 ; GFX11-NEXT: v_readlane_b32 s0, v77, 8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v148, s0 ; GFX11-NEXT: v_readlane_b32 s0, v78, 0 +; GFX11-NEXT: v_readlane_b32 s1, v78, 1 ; GFX11-NEXT: v_mov_b32_e32 v82, s0 ; GFX11-NEXT: .LBB95_5: ; %end ; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v74 @@ -211589,6 +211589,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_writelane_b32 v41, s17, 23 ; SI-NEXT: s_lshr_b64 s[16:17], s[60:61], 16 ; SI-NEXT: v_writelane_b32 v41, s16, 20 +; SI-NEXT: v_writelane_b32 v41, s17, 21 ; SI-NEXT: s_lshr_b32 s16, s61, 24 ; SI-NEXT: v_writelane_b32 v43, s16, 18 ; SI-NEXT: s_lshr_b32 s16, s61, 16 @@ -211690,7 +211691,6 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_lshr_b32 s16, s5, 16 ; SI-NEXT: v_writelane_b32 v42, s16, 0 ; SI-NEXT: s_lshr_b32 s16, s5, 8 -; SI-NEXT: v_writelane_b32 v41, s17, 21 ; SI-NEXT: v_writelane_b32 v42, s16, 1 ; SI-NEXT: s_lshr_b64 s[16:17], s[56:57], 24 ; SI-NEXT: v_writelane_b32 v41, s16, 28 @@ -211833,6 +211833,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_readlane_b32 s17, v41, 25 ; SI-NEXT: s_lshl_b32 s17, s16, 8 ; SI-NEXT: s_and_b32 s18, s56, 0xff +; SI-NEXT: v_readlane_b32 s21, v41, 23 ; SI-NEXT: s_or_b32 s17, s18, s17 ; SI-NEXT: v_readlane_b32 s18, v41, 26 ; SI-NEXT: v_readlane_b32 s19, v41, 27 @@ -211974,6 +211975,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v14, s17 ; SI-NEXT: s_lshl_b32 s17, s90, 8 ; SI-NEXT: s_and_b32 s18, s46, 0xff +; SI-NEXT: v_readlane_b32 s21, v41, 29 ; SI-NEXT: s_or_b32 s17, s18, s17 ; SI-NEXT: v_readlane_b32 s18, v41, 30 ; SI-NEXT: v_readlane_b32 s19, v41, 31 @@ -211992,78 +211994,82 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_and_b32 s18, s16, 0xff ; SI-NEXT: v_readlane_b32 s16, v43, 39 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_lshl_b32 s19, s16, 24 +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: s_and_b32 s17, s17, 0xffff ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_lshl_b32 s19, s16, 24 +; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: v_mov_b32_e32 v16, s17 +; SI-NEXT: v_readlane_b32 s16, v41, 34 ; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: v_readlane_b32 s17, v41, 35 +; SI-NEXT: v_readlane_b32 s18, v41, 36 ; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; SI-NEXT: v_mov_b32_e32 v16, s17 -; SI-NEXT: v_readlane_b32 s16, v41, 34 -; SI-NEXT: v_readlane_b32 s17, v41, 35 -; SI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 ; SI-NEXT: s_lshl_b32 s16, s16, 8 ; SI-NEXT: s_and_b32 s17, s42, 0xff -; SI-NEXT: v_readlane_b32 s18, v41, 36 -; SI-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 +; SI-NEXT: v_readlane_b32 s19, v41, 37 +; SI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_and_b32 s17, s18, 0xff ; SI-NEXT: v_readlane_b32 s18, v41, 38 -; SI-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 ; SI-NEXT: s_lshl_b32 s18, s18, 24 ; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 ; SI-NEXT: s_or_b32 s17, s18, s17 ; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s17, v43, 44 ; SI-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 -; SI-NEXT: v_readlane_b32 s17, v43, 44 -; SI-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 ; SI-NEXT: v_mov_b32_e32 v2, s16 ; SI-NEXT: s_and_b32 s16, s43, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 -; SI-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: v_readlane_b32 s17, v43, 43 -; SI-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 ; SI-NEXT: s_and_b32 s17, s17, 0xff ; SI-NEXT: v_readlane_b32 s18, v43, 42 -; SI-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 +; SI-NEXT: v_readlane_b32 s19, v41, 39 +; SI-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 ; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: v_readlane_b32 s18, v41, 40 ; SI-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 ; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: v_readlane_b32 s19, v41, 41 ; SI-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 ; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_readlane_b32 s18, v41, 40 +; SI-NEXT: s_lshl_b32 s17, s18, 8 +; SI-NEXT: v_readlane_b32 s18, v41, 42 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s16 ; SI-NEXT: s_and_b32 s16, s40, 0xff -; SI-NEXT: s_lshl_b32 s17, s18, 8 -; SI-NEXT: v_readlane_b32 s18, v41, 42 +; SI-NEXT: v_readlane_b32 s19, v41, 43 ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: s_and_b32 s17, s18, 0xff ; SI-NEXT: v_readlane_b32 s18, v41, 44 @@ -212094,8 +212100,9 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s16 ; SI-NEXT: v_readlane_b32 s16, v41, 46 ; SI-NEXT: s_and_b32 s14, s14, 0xff -; SI-NEXT: s_lshl_b32 s16, s16, 8 ; SI-NEXT: v_readlane_b32 s17, v41, 47 +; SI-NEXT: s_lshl_b32 s16, s16, 8 +; SI-NEXT: v_readlane_b32 s19, v41, 45 ; SI-NEXT: s_or_b32 s14, s14, s16 ; SI-NEXT: v_readlane_b32 s16, v41, 48 ; SI-NEXT: v_readlane_b32 s17, v41, 49 @@ -212128,8 +212135,8 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s14 ; SI-NEXT: v_readlane_b32 s14, v41, 52 ; SI-NEXT: s_and_b32 s12, s12, 0xff -; SI-NEXT: s_lshl_b32 s14, s14, 8 ; SI-NEXT: v_readlane_b32 s15, v41, 53 +; SI-NEXT: s_lshl_b32 s14, s14, 8 ; SI-NEXT: s_or_b32 s12, s12, s14 ; SI-NEXT: v_readlane_b32 s14, v41, 54 ; SI-NEXT: v_readlane_b32 s15, v41, 55 @@ -212162,8 +212169,8 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s12 ; SI-NEXT: v_readlane_b32 s12, v41, 58 ; SI-NEXT: s_and_b32 s10, s10, 0xff -; SI-NEXT: s_lshl_b32 s12, s12, 8 ; SI-NEXT: v_readlane_b32 s13, v41, 59 +; SI-NEXT: s_lshl_b32 s12, s12, 8 ; SI-NEXT: s_or_b32 s10, s10, s12 ; SI-NEXT: v_readlane_b32 s12, v41, 60 ; SI-NEXT: v_readlane_b32 s13, v41, 61 @@ -212196,8 +212203,8 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: v_readlane_b32 s10, v43, 0 ; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: s_lshl_b32 s10, s10, 8 ; SI-NEXT: v_readlane_b32 s11, v43, 1 +; SI-NEXT: s_lshl_b32 s10, s10, 8 ; SI-NEXT: s_or_b32 s8, s8, s10 ; SI-NEXT: v_readlane_b32 s10, v43, 2 ; SI-NEXT: v_readlane_b32 s11, v43, 3 @@ -212230,8 +212237,8 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_readlane_b32 s8, v43, 6 ; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: s_lshl_b32 s8, s8, 8 ; SI-NEXT: v_readlane_b32 s9, v43, 7 +; SI-NEXT: s_lshl_b32 s8, s8, 8 ; SI-NEXT: s_or_b32 s6, s6, s8 ; SI-NEXT: v_readlane_b32 s8, v43, 8 ; SI-NEXT: v_readlane_b32 s9, v43, 9 @@ -212264,8 +212271,8 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_readlane_b32 s6, v43, 12 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s6, s6, 8 ; SI-NEXT: v_readlane_b32 s7, v43, 13 +; SI-NEXT: s_lshl_b32 s6, s6, 8 ; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: v_readlane_b32 s6, v43, 14 ; SI-NEXT: v_readlane_b32 s7, v43, 15 @@ -212285,21 +212292,14 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_readlane_b32 s5, v42, 0 -; SI-NEXT: v_readlane_b32 s19, v41, 37 ; SI-NEXT: s_and_b32 s5, s5, 0xff ; SI-NEXT: v_readlane_b32 s6, v43, 63 -; SI-NEXT: v_readlane_b32 s19, v41, 39 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s6, s6, 24 -; SI-NEXT: v_readlane_b32 s19, v41, 41 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: v_readlane_b32 s21, v41, 23 -; SI-NEXT: v_readlane_b32 s19, v41, 43 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x78, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s21, v41, 29 -; SI-NEXT: v_readlane_b32 s19, v41, 45 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: v_mov_b32_e32 v1, s4 @@ -212357,14 +212357,47 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB99_4: ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr16 ; SI-NEXT: v_writelane_b32 v41, s4, 20 ; SI-NEXT: v_writelane_b32 v41, s5, 21 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr82 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: v_writelane_b32 v41, s4, 22 +; SI-NEXT: v_writelane_b32 v41, s5, 23 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v41, s5, 23 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; kill: killed $sgpr4 @@ -212376,6 +212409,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_writelane_b32 v41, s5, 27 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: v_writelane_b32 v41, s4, 28 +; SI-NEXT: v_writelane_b32 v41, s5, 29 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 @@ -212463,7 +212497,6 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v41, s5, 29 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; kill: killed $sgpr4 @@ -212517,6 +212550,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_writelane_b32 v41, s5, 61 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: v_writelane_b32 v41, s4, 62 +; SI-NEXT: v_writelane_b32 v41, s5, 63 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v43, s4, 0 @@ -212536,48 +212570,14 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: v_writelane_b32 v43, s4, 10 ; SI-NEXT: v_writelane_b32 v43, s5, 11 -; SI-NEXT: ; implicit-def: $sgpr16 ; SI-NEXT: v_writelane_b32 v43, s16, 12 ; SI-NEXT: v_writelane_b32 v43, s17, 13 ; SI-NEXT: ; implicit-def: $sgpr16 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: v_writelane_b32 v43, s16, 14 ; SI-NEXT: v_writelane_b32 v43, s17, 15 ; SI-NEXT: ; implicit-def: $sgpr16 ; SI-NEXT: v_writelane_b32 v43, s16, 16 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr92 -; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $sgpr82 -; SI-NEXT: ; implicit-def: $sgpr80 -; SI-NEXT: ; implicit-def: $sgpr70 -; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr68 -; SI-NEXT: ; implicit-def: $sgpr66 -; SI-NEXT: ; implicit-def: $sgpr64 -; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; implicit-def: $sgpr52 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr38 -; SI-NEXT: ; implicit-def: $sgpr36 -; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr34 -; SI-NEXT: ; implicit-def: $sgpr30 -; SI-NEXT: ; implicit-def: $sgpr94 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr90 -; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: v_writelane_b32 v41, s5, 63 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: v_writelane_b32 v43, s17, 17 ; SI-NEXT: s_branch .LBB99_2 ; @@ -215550,6 +215550,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: ; implicit-def: $sgpr74 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: v_writelane_b32 v78, s42, 0 +; GFX11-NEXT: v_writelane_b32 v78, s43, 1 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 @@ -215560,10 +215561,8 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: v_writelane_b32 v78, s43, 1 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr43 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 @@ -215626,6 +215625,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr43 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: s_branch .LBB99_2 ; GFX11-NEXT: .LBB99_4: @@ -215708,10 +215708,9 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: v_readlane_b32 s0, v78, 18 ; GFX11-NEXT: v_dual_mov_b32 v67, s58 :: v_dual_mov_b32 v26, s88 ; GFX11-NEXT: v_dual_mov_b32 v81, s44 :: v_dual_mov_b32 v30, s90 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_mov_b32_e32 v42, s0 ; GFX11-NEXT: v_readlane_b32 s0, v78, 19 -; GFX11-NEXT: v_readlane_b32 s1, v78, 1 ; GFX11-NEXT: v_dual_mov_b32 v17, s74 :: v_dual_mov_b32 v34, s92 ; GFX11-NEXT: v_dual_mov_b32 v65, s94 :: v_dual_mov_b32 v68, s30 ; GFX11-NEXT: v_mov_b32_e32 v41, s0 @@ -215771,9 +215770,10 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: v_readlane_b32 s0, v77, 7 ; GFX11-NEXT: v_mov_b32_e32 v149, s0 ; GFX11-NEXT: v_readlane_b32 s0, v77, 8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v148, s0 ; GFX11-NEXT: v_readlane_b32 s0, v78, 0 +; GFX11-NEXT: v_readlane_b32 s1, v78, 1 ; GFX11-NEXT: v_mov_b32_e32 v82, s0 ; GFX11-NEXT: .LBB99_5: ; %end ; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v74 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll index 9c05297f7bcae..3e2b488d02f37 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll @@ -66739,9 +66739,11 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; SI-NEXT: v_writelane_b32 v21, s17, 13 ; SI-NEXT: .LBB97_3: ; %end ; SI-NEXT: v_readlane_b32 s18, v21, 0 -; SI-NEXT: s_and_b32 s16, s40, 0xff +; SI-NEXT: v_readlane_b32 s19, v21, 1 ; SI-NEXT: s_lshl_b32 s17, s18, 8 ; SI-NEXT: v_readlane_b32 s18, v21, 2 +; SI-NEXT: s_and_b32 s16, s40, 0xff +; SI-NEXT: v_readlane_b32 s19, v21, 3 ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: s_and_b32 s17, s18, 0xff ; SI-NEXT: v_readlane_b32 s18, v21, 4 @@ -66763,8 +66765,9 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; SI-NEXT: v_mov_b32_e32 v2, s16 ; SI-NEXT: v_readlane_b32 s16, v21, 6 ; SI-NEXT: s_and_b32 s14, s14, 0xff -; SI-NEXT: s_lshl_b32 s16, s16, 8 ; SI-NEXT: v_readlane_b32 s17, v21, 7 +; SI-NEXT: s_lshl_b32 s16, s16, 8 +; SI-NEXT: v_readlane_b32 s19, v21, 5 ; SI-NEXT: s_or_b32 s14, s14, s16 ; SI-NEXT: v_readlane_b32 s16, v21, 8 ; SI-NEXT: v_readlane_b32 s17, v21, 9 @@ -66796,8 +66799,8 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; SI-NEXT: v_mov_b32_e32 v2, s14 ; SI-NEXT: v_readlane_b32 s14, v21, 12 ; SI-NEXT: s_and_b32 s10, s10, 0xff -; SI-NEXT: s_lshl_b32 s14, s14, 8 ; SI-NEXT: v_readlane_b32 s15, v21, 13 +; SI-NEXT: s_lshl_b32 s14, s14, 8 ; SI-NEXT: s_or_b32 s10, s10, s14 ; SI-NEXT: v_readlane_b32 s14, v21, 14 ; SI-NEXT: v_readlane_b32 s15, v21, 15 @@ -66948,13 +66951,10 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; SI-NEXT: s_and_b32 s5, s89, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s6, s91, 24 -; SI-NEXT: v_readlane_b32 s19, v21, 1 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: v_readlane_b32 s19, v21, 3 ; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s19, v21, 5 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 ; SI-NEXT: v_mov_b32_e32 v1, s4 @@ -67009,28 +67009,6 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; SI-NEXT: v_writelane_b32 v21, s4, 0 ; SI-NEXT: v_writelane_b32 v21, s5, 1 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v21, s4, 2 -; SI-NEXT: v_writelane_b32 v21, s5, 3 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v21, s4, 4 -; SI-NEXT: v_writelane_b32 v21, s5, 5 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v21, s4, 6 -; SI-NEXT: v_writelane_b32 v21, s5, 7 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v21, s4, 8 -; SI-NEXT: v_writelane_b32 v21, s5, 9 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v21, s4, 10 -; SI-NEXT: v_writelane_b32 v21, s5, 11 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v21, s4, 12 -; SI-NEXT: v_writelane_b32 v21, s5, 13 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v21, s4, 14 -; SI-NEXT: v_writelane_b32 v21, s5, 15 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v21, s4, 16 ; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; implicit-def: $sgpr74 @@ -67058,7 +67036,6 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; SI-NEXT: ; implicit-def: $sgpr79 ; SI-NEXT: ; implicit-def: $sgpr89 ; SI-NEXT: ; implicit-def: $sgpr91 -; SI-NEXT: v_writelane_b32 v21, s5, 17 ; SI-NEXT: ; implicit-def: $sgpr42 ; SI-NEXT: ; implicit-def: $sgpr66 ; SI-NEXT: ; implicit-def: $sgpr64 @@ -67075,10 +67052,33 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; SI-NEXT: ; implicit-def: $sgpr30 ; SI-NEXT: ; implicit-def: $sgpr94 ; SI-NEXT: ; implicit-def: $sgpr92 -; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr90 ; SI-NEXT: ; implicit-def: $sgpr88 ; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: v_writelane_b32 v21, s4, 2 +; SI-NEXT: v_writelane_b32 v21, s5, 3 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v21, s4, 4 +; SI-NEXT: v_writelane_b32 v21, s5, 5 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v21, s4, 6 +; SI-NEXT: v_writelane_b32 v21, s5, 7 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v21, s4, 8 +; SI-NEXT: v_writelane_b32 v21, s5, 9 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v21, s4, 10 +; SI-NEXT: v_writelane_b32 v21, s5, 11 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v21, s4, 12 +; SI-NEXT: v_writelane_b32 v21, s5, 13 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v21, s4, 14 +; SI-NEXT: v_writelane_b32 v21, s5, 15 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v21, s4, 16 +; SI-NEXT: v_writelane_b32 v21, s5, 17 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: s_branch .LBB97_2 ; ; VI-LABEL: bitcast_v32i16_to_v64i8_scalar: @@ -88402,8 +88402,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: s_lshr_b64 s[4:5], s[74:75], 24 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v41, s4, 0 -; SI-NEXT: v_readfirstlane_b32 s4, v6 ; SI-NEXT: v_writelane_b32 v41, s5, 1 +; SI-NEXT: v_readfirstlane_b32 s4, v6 ; SI-NEXT: s_lshr_b32 s5, s4, 16 ; SI-NEXT: v_readfirstlane_b32 s4, v7 ; SI-NEXT: s_lshr_b64 s[60:61], s[4:5], 16 diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll index 34abba10f6c61..37cbd2d926413 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll @@ -150,8 +150,8 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) { ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_mov_b32_e32 v1, vcc_lo ; GCN-NEXT: v_readlane_b32 s0, v0, 0 +; GCN-NEXT: v_mov_b32_e32 v1, vcc_lo ; GCN-NEXT: v_readlane_b32 s1, v0, 1 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir b/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir index 744871d8c84ff..5f36d5403ebcf 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir @@ -65,12 +65,12 @@ body: | ; CHECK: S_NOP 0, implicit-def $exec ; CHECK-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; CHECK-NEXT: $vgpr0 = IMPLICIT_DEF - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, $vgpr0 - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1 + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, $vgpr0, implicit $sgpr0_sgpr1 + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1 ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1 ; CHECK-NEXT: S_NOP 0, implicit-def dead renamable $sgpr2_sgpr3, implicit-def dead renamable $sgpr0_sgpr1, implicit killed renamable $sgpr0_sgpr1 - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1 ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1 ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec @@ -141,12 +141,12 @@ body: | ; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_exec ; CHECK: S_NOP 0, implicit-def renamable $sgpr0_sgpr1, implicit-def dead renamable $sgpr2_sgpr3, implicit-def $exec ; CHECK-NEXT: $vgpr0 = IMPLICIT_DEF - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, $vgpr0 - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1 + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, $vgpr0, implicit $sgpr0_sgpr1 + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1 ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1 ; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr0_sgpr1, implicit-def dead renamable $sgpr2_sgpr3, implicit-def dead renamable $sgpr0_sgpr1 - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1 ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1 ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll index 420f003d4f417..76f204dd0c16a 100644 --- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll +++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll @@ -41,10 +41,10 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: s_load_dwordx16 s[8:23], s[68:69], 0x130 ; CHECK-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane ; CHECK-NEXT: v_writelane_b32 v6, s70, 20 -; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: v_writelane_b32 v6, s71, 21 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v1, s4 +; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: v_writelane_b32 v7, s8, 0 ; CHECK-NEXT: v_writelane_b32 v7, s9, 1 ; CHECK-NEXT: v_writelane_b32 v7, s10, 2 @@ -76,14 +76,15 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: v_writelane_b32 v7, s64, 28 ; CHECK-NEXT: v_writelane_b32 v7, s65, 29 ; CHECK-NEXT: v_writelane_b32 v7, s66, 30 -; CHECK-NEXT: v_writelane_b32 v7, s67, 31 ; CHECK-NEXT: s_load_dwordx16 s[8:23], s[68:69], 0x1f0 ; CHECK-NEXT: s_load_dwordx16 s[36:51], s[68:69], 0x2f0 ; CHECK-NEXT: s_mov_b32 s69, s68 ; CHECK-NEXT: s_mov_b32 s70, s68 ; CHECK-NEXT: s_mov_b32 s71, s68 -; CHECK-NEXT: v_mov_b32_e32 v3, v2 +; CHECK-NEXT: v_writelane_b32 v7, s67, 31 +; CHECK-NEXT: image_sample_lz v1, v[1:2], s[60:67], s[68:71] dmask:0x1 ; CHECK-NEXT: v_readlane_b32 s52, v7, 0 +; CHECK-NEXT: v_mov_b32_e32 v3, v2 ; CHECK-NEXT: v_readlane_b32 s53, v7, 1 ; CHECK-NEXT: v_readlane_b32 s54, v7, 2 ; CHECK-NEXT: v_readlane_b32 s55, v7, 3 @@ -91,13 +92,12 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: v_readlane_b32 s57, v7, 5 ; CHECK-NEXT: v_readlane_b32 s58, v7, 6 ; CHECK-NEXT: v_readlane_b32 s59, v7, 7 -; CHECK-NEXT: image_sample_lz v1, v[1:2], s[60:67], s[68:71] dmask:0x1 ; CHECK-NEXT: v_and_b32_e32 v5, 1, v0 ; CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v5 ; CHECK-NEXT: v_readlane_b32 s60, v7, 8 ; CHECK-NEXT: v_readlane_b32 s61, v7, 9 -; CHECK-NEXT: image_sample_lz v4, v[2:3], s[52:59], s[68:71] dmask:0x1 ; CHECK-NEXT: v_readlane_b32 s62, v7, 10 +; CHECK-NEXT: image_sample_lz v4, v[2:3], s[52:59], s[68:71] dmask:0x1 ; CHECK-NEXT: v_readlane_b32 s63, v7, 11 ; CHECK-NEXT: v_readlane_b32 s64, v7, 12 ; CHECK-NEXT: v_readlane_b32 s65, v7, 13 @@ -109,6 +109,7 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; CHECK-NEXT: s_cbranch_execz .LBB0_3 ; CHECK-NEXT: ; %bb.1: ; %bb48 +; CHECK-NEXT: v_readlane_b32 s52, v7, 16 ; CHECK-NEXT: v_readlane_b32 s60, v7, 24 ; CHECK-NEXT: v_readlane_b32 s61, v7, 25 ; CHECK-NEXT: v_readlane_b32 s62, v7, 26 @@ -119,11 +120,10 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: v_readlane_b32 s67, v7, 31 ; CHECK-NEXT: v_mov_b32_e32 v1, v2 ; CHECK-NEXT: s_and_b64 vcc, exec, -1 -; CHECK-NEXT: v_readlane_b32 s52, v7, 16 ; CHECK-NEXT: v_readlane_b32 s53, v7, 17 ; CHECK-NEXT: v_readlane_b32 s54, v7, 18 -; CHECK-NEXT: image_sample_lz v3, v[2:3], s[60:67], s[68:71] dmask:0x1 ; CHECK-NEXT: v_readlane_b32 s55, v7, 19 +; CHECK-NEXT: image_sample_lz v3, v[2:3], s[60:67], s[68:71] dmask:0x1 ; CHECK-NEXT: v_readlane_b32 s56, v7, 20 ; CHECK-NEXT: v_readlane_b32 s57, v7, 21 ; CHECK-NEXT: v_readlane_b32 s58, v7, 22 @@ -152,18 +152,10 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: s_mov_b32 s16, 0 ; CHECK-NEXT: s_mov_b32 s17, s16 ; CHECK-NEXT: v_mov_b32_e32 v0, s16 -; CHECK-NEXT: v_readlane_b32 s52, v7, 24 -; CHECK-NEXT: v_readlane_b32 s53, v7, 25 -; CHECK-NEXT: v_readlane_b32 s54, v7, 26 -; CHECK-NEXT: v_readlane_b32 s55, v7, 27 -; CHECK-NEXT: v_readlane_b32 s56, v7, 28 -; CHECK-NEXT: v_readlane_b32 s57, v7, 29 -; CHECK-NEXT: v_readlane_b32 s58, v7, 30 -; CHECK-NEXT: v_readlane_b32 s59, v7, 31 +; CHECK-NEXT: v_readlane_b32 s44, v7, 16 ; CHECK-NEXT: v_mov_b32_e32 v1, s17 ; CHECK-NEXT: s_mov_b32 s18, s16 ; CHECK-NEXT: s_mov_b32 s19, s16 -; CHECK-NEXT: v_readlane_b32 s44, v7, 16 ; CHECK-NEXT: v_readlane_b32 s45, v7, 17 ; CHECK-NEXT: v_readlane_b32 s46, v7, 18 ; CHECK-NEXT: v_readlane_b32 s47, v7, 19 @@ -171,6 +163,16 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: v_readlane_b32 s49, v7, 21 ; CHECK-NEXT: v_readlane_b32 s50, v7, 22 ; CHECK-NEXT: v_readlane_b32 s51, v7, 23 +; CHECK-NEXT: v_readlane_b32 s52, v7, 24 +; CHECK-NEXT: v_readlane_b32 s53, v7, 25 +; CHECK-NEXT: v_readlane_b32 s54, v7, 26 +; CHECK-NEXT: v_readlane_b32 s55, v7, 27 +; CHECK-NEXT: v_readlane_b32 s56, v7, 28 +; CHECK-NEXT: v_readlane_b32 s57, v7, 29 +; CHECK-NEXT: v_readlane_b32 s58, v7, 30 +; CHECK-NEXT: v_readlane_b32 s59, v7, 31 +; CHECK-NEXT: image_sample_lz v2, v[0:1], s[44:51], s[16:19] dmask:0x1 +; CHECK-NEXT: v_readlane_b32 s44, v7, 0 ; CHECK-NEXT: v_readlane_b32 s52, v7, 8 ; CHECK-NEXT: v_readlane_b32 s53, v7, 9 ; CHECK-NEXT: v_readlane_b32 s54, v7, 10 @@ -179,14 +181,12 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: v_readlane_b32 s57, v7, 13 ; CHECK-NEXT: v_readlane_b32 s58, v7, 14 ; CHECK-NEXT: v_readlane_b32 s59, v7, 15 -; CHECK-NEXT: image_sample_lz v2, v[0:1], s[44:51], s[16:19] dmask:0x1 ; CHECK-NEXT: v_mov_b32_e32 v3, 0 ; CHECK-NEXT: v_mov_b32_e32 v4, v3 -; CHECK-NEXT: v_readlane_b32 s44, v7, 0 ; CHECK-NEXT: v_readlane_b32 s45, v7, 1 -; CHECK-NEXT: image_sample_lz v0, v[0:1], s[52:59], s[24:27] dmask:0x1 ; CHECK-NEXT: v_readlane_b32 s46, v7, 2 ; CHECK-NEXT: v_readlane_b32 s47, v7, 3 +; CHECK-NEXT: image_sample_lz v0, v[0:1], s[52:59], s[24:27] dmask:0x1 ; CHECK-NEXT: v_readlane_b32 s48, v7, 4 ; CHECK-NEXT: v_readlane_b32 s49, v7, 5 ; CHECK-NEXT: v_readlane_b32 s50, v7, 6 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll index 4a89b2fcc017c..e1b4cad370f96 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -1826,10 +1826,10 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: s_or_b32 s0, s0, s5 ; GCN-NEXT: s_and_b32 s0, s0, 0xffff ; GCN-NEXT: s_or_b32 s0, s0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_readlane_b32 s1, v6, 1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_readlane_b32 s0, v6, 0 +; GCN-NEXT: v_readlane_b32 s1, v6, 1 ; GCN-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s3 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll index 689b38846c61b..a21a405164b6a 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll @@ -10334,11 +10334,11 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: v_writelane_b32 v62, s3, 5 ; GFX8-NEXT: v_readlane_b32 s2, v62, 2 ; GFX8-NEXT: v_readlane_b32 s3, v62, 3 -; GFX8-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 ; GFX8-NEXT: v_mov_b32_e32 v35, s49 ; GFX8-NEXT: s_bfe_i64 s[48:49], s[4:5], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x10000 ; GFX8-NEXT: v_readlane_b32 s2, v62, 0 +; GFX8-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 ; GFX8-NEXT: v_readlane_b32 s3, v62, 1 ; GFX8-NEXT: v_mov_b32_e32 v5, s75 ; GFX8-NEXT: v_mov_b32_e32 v13, s73 @@ -10632,8 +10632,8 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_readlane_b32 s2, v62, 4 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_readlane_b32 s3, v62, 5 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v0, s30 diff --git a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll index 58375b6f8a8a4..75638c5fa8476 100644 --- a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll @@ -139,13 +139,13 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX906-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX906-NEXT: s_mov_b64 exec, s[34:35] +; GFX906-NEXT: v_readlane_b32 s16, v39, 22 ; GFX906-NEXT: s_mov_b32 s12, s24 ; GFX906-NEXT: s_mov_b32 s13, s23 ; GFX906-NEXT: s_mov_b32 s14, s22 ; GFX906-NEXT: v_mov_b32_e32 v31, v32 ; GFX906-NEXT: s_mov_b32 s15, s21 ; GFX906-NEXT: s_mov_b64 s[10:11], s[26:27] -; GFX906-NEXT: v_readlane_b32 s16, v39, 22 ; GFX906-NEXT: v_readlane_b32 s17, v39, 23 ; GFX906-NEXT: v_mov_b32_e32 v40, v32 ; GFX906-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -232,20 +232,20 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX906-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX906-NEXT: s_mov_b64 exec, s[34:35] +; GFX906-NEXT: v_readlane_b32 s4, v39, 10 +; GFX906-NEXT: v_readlane_b32 s6, v39, 8 +; GFX906-NEXT: v_readlane_b32 s8, v39, 6 +; GFX906-NEXT: v_readlane_b32 s10, v39, 4 +; GFX906-NEXT: v_readlane_b32 s16, v39, 22 ; GFX906-NEXT: v_readlane_b32 s12, v39, 3 ; GFX906-NEXT: v_mov_b32_e32 v31, v40 ; GFX906-NEXT: v_readlane_b32 s13, v39, 2 ; GFX906-NEXT: v_readlane_b32 s14, v39, 1 ; GFX906-NEXT: v_readlane_b32 s15, v39, 0 -; GFX906-NEXT: v_readlane_b32 s4, v39, 10 ; GFX906-NEXT: v_readlane_b32 s5, v39, 11 -; GFX906-NEXT: v_readlane_b32 s6, v39, 8 ; GFX906-NEXT: v_readlane_b32 s7, v39, 9 -; GFX906-NEXT: v_readlane_b32 s8, v39, 6 ; GFX906-NEXT: v_readlane_b32 s9, v39, 7 -; GFX906-NEXT: v_readlane_b32 s10, v39, 4 ; GFX906-NEXT: v_readlane_b32 s11, v39, 5 -; GFX906-NEXT: v_readlane_b32 s16, v39, 22 ; GFX906-NEXT: v_readlane_b32 s17, v39, 23 ; GFX906-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1 @@ -253,19 +253,19 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX906-NEXT: s_mov_b64 exec, s[34:35] ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_readlane_b32 s4, v39, 10 -; GFX906-NEXT: v_readlane_b32 s5, v39, 11 ; GFX906-NEXT: v_readlane_b32 s6, v39, 8 -; GFX906-NEXT: v_readlane_b32 s7, v39, 9 ; GFX906-NEXT: v_readlane_b32 s8, v39, 6 -; GFX906-NEXT: v_readlane_b32 s9, v39, 7 ; GFX906-NEXT: v_readlane_b32 s10, v39, 4 +; GFX906-NEXT: v_readlane_b32 s16, v39, 22 +; GFX906-NEXT: v_readlane_b32 s5, v39, 11 +; GFX906-NEXT: v_readlane_b32 s7, v39, 9 +; GFX906-NEXT: v_readlane_b32 s9, v39, 7 ; GFX906-NEXT: v_readlane_b32 s11, v39, 5 ; GFX906-NEXT: v_readlane_b32 s12, v39, 3 ; GFX906-NEXT: v_readlane_b32 s13, v39, 2 ; GFX906-NEXT: v_readlane_b32 s14, v39, 1 ; GFX906-NEXT: v_readlane_b32 s15, v39, 0 ; GFX906-NEXT: v_mov_b32_e32 v31, v40 -; GFX906-NEXT: v_readlane_b32 s16, v39, 22 ; GFX906-NEXT: v_readlane_b32 s17, v39, 23 ; GFX906-NEXT: v_readlane_b32 s21, v39, 12 ; GFX906-NEXT: ;;#ASMSTART @@ -528,13 +528,13 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX908-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX908-NEXT: s_mov_b64 exec, s[34:35] +; GFX908-NEXT: v_readlane_b32 s16, v39, 22 ; GFX908-NEXT: s_mov_b32 s12, s24 ; GFX908-NEXT: s_mov_b32 s13, s23 ; GFX908-NEXT: s_mov_b32 s14, s22 ; GFX908-NEXT: v_mov_b32_e32 v31, v32 ; GFX908-NEXT: s_mov_b32 s15, s21 ; GFX908-NEXT: s_mov_b64 s[10:11], s[26:27] -; GFX908-NEXT: v_readlane_b32 s16, v39, 22 ; GFX908-NEXT: v_readlane_b32 s17, v39, 23 ; GFX908-NEXT: v_mov_b32_e32 v40, v32 ; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -621,20 +621,20 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX908-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX908-NEXT: s_mov_b64 exec, s[34:35] +; GFX908-NEXT: v_readlane_b32 s4, v39, 10 +; GFX908-NEXT: v_readlane_b32 s6, v39, 8 +; GFX908-NEXT: v_readlane_b32 s8, v39, 6 +; GFX908-NEXT: v_readlane_b32 s10, v39, 4 +; GFX908-NEXT: v_readlane_b32 s16, v39, 22 ; GFX908-NEXT: v_readlane_b32 s12, v39, 3 ; GFX908-NEXT: v_mov_b32_e32 v31, v40 ; GFX908-NEXT: v_readlane_b32 s13, v39, 2 ; GFX908-NEXT: v_readlane_b32 s14, v39, 1 ; GFX908-NEXT: v_readlane_b32 s15, v39, 0 -; GFX908-NEXT: v_readlane_b32 s4, v39, 10 ; GFX908-NEXT: v_readlane_b32 s5, v39, 11 -; GFX908-NEXT: v_readlane_b32 s6, v39, 8 ; GFX908-NEXT: v_readlane_b32 s7, v39, 9 -; GFX908-NEXT: v_readlane_b32 s8, v39, 6 ; GFX908-NEXT: v_readlane_b32 s9, v39, 7 -; GFX908-NEXT: v_readlane_b32 s10, v39, 4 ; GFX908-NEXT: v_readlane_b32 s11, v39, 5 -; GFX908-NEXT: v_readlane_b32 s16, v39, 22 ; GFX908-NEXT: v_readlane_b32 s17, v39, 23 ; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1 @@ -642,19 +642,19 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX908-NEXT: s_mov_b64 exec, s[34:35] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_readlane_b32 s4, v39, 10 -; GFX908-NEXT: v_readlane_b32 s5, v39, 11 ; GFX908-NEXT: v_readlane_b32 s6, v39, 8 -; GFX908-NEXT: v_readlane_b32 s7, v39, 9 ; GFX908-NEXT: v_readlane_b32 s8, v39, 6 -; GFX908-NEXT: v_readlane_b32 s9, v39, 7 ; GFX908-NEXT: v_readlane_b32 s10, v39, 4 +; GFX908-NEXT: v_readlane_b32 s16, v39, 22 +; GFX908-NEXT: v_readlane_b32 s5, v39, 11 +; GFX908-NEXT: v_readlane_b32 s7, v39, 9 +; GFX908-NEXT: v_readlane_b32 s9, v39, 7 ; GFX908-NEXT: v_readlane_b32 s11, v39, 5 ; GFX908-NEXT: v_readlane_b32 s12, v39, 3 ; GFX908-NEXT: v_readlane_b32 s13, v39, 2 ; GFX908-NEXT: v_readlane_b32 s14, v39, 1 ; GFX908-NEXT: v_readlane_b32 s15, v39, 0 ; GFX908-NEXT: v_mov_b32_e32 v31, v40 -; GFX908-NEXT: v_readlane_b32 s16, v39, 22 ; GFX908-NEXT: v_readlane_b32 s17, v39, 23 ; GFX908-NEXT: v_readlane_b32 s21, v39, 12 ; GFX908-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll index d1dee534414ac..15f5f890d57b5 100644 --- a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll @@ -8,7 +8,6 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-LABEL: kernel0: ; CHECK: ; %bb.0: -; CHECK-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART @@ -20,9 +19,10 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[2:3] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane +; CHECK-NEXT: s_load_dword s0, s[8:9], 0x8 ; CHECK-NEXT: v_writelane_b32 v22, s2, 0 ; CHECK-NEXT: v_writelane_b32 v22, s3, 1 -; CHECK-NEXT: s_load_dword s0, s[8:9], 0x8 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[48:51] ; CHECK-NEXT: ;;#ASMEND @@ -123,19 +123,19 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: v_writelane_b32 v22, s0, 58 ; CHECK-NEXT: v_writelane_b32 v22, s1, 59 ; CHECK-NEXT: v_writelane_b32 v22, s2, 60 -; CHECK-NEXT: ; implicit-def: $vgpr23 : SGPR spill to VGPR lane ; CHECK-NEXT: v_writelane_b32 v22, s3, 61 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:7] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ; implicit-def: $vgpr23 : SGPR spill to VGPR lane +; CHECK-NEXT: v_writelane_b32 v22, s0, 62 ; CHECK-NEXT: v_writelane_b32 v23, s2, 0 ; CHECK-NEXT: v_writelane_b32 v23, s3, 1 ; CHECK-NEXT: v_writelane_b32 v23, s4, 2 ; CHECK-NEXT: v_writelane_b32 v23, s5, 3 ; CHECK-NEXT: v_writelane_b32 v23, s6, 4 -; CHECK-NEXT: v_writelane_b32 v22, s0, 62 -; CHECK-NEXT: v_writelane_b32 v23, s7, 5 ; CHECK-NEXT: v_writelane_b32 v22, s1, 63 +; CHECK-NEXT: v_writelane_b32 v23, s7, 5 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:15] ; CHECK-NEXT: ;;#ASMEND @@ -208,9 +208,6 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_readlane_b32 s0, v22, 2 ; CHECK-NEXT: v_readlane_b32 s1, v22, 3 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[48:51] -; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_readlane_b32 s2, v22, 4 ; CHECK-NEXT: v_readlane_b32 s3, v22, 5 ; CHECK-NEXT: v_readlane_b32 s4, v22, 6 @@ -218,6 +215,9 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: v_readlane_b32 s6, v22, 8 ; CHECK-NEXT: v_readlane_b32 s7, v22, 9 ; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[48:51] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:7] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_readlane_b32 s0, v22, 10 @@ -241,12 +241,6 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_readlane_b32 s0, v22, 26 ; CHECK-NEXT: v_readlane_b32 s1, v22, 27 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[38:39] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[44:47] -; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_readlane_b32 s2, v22, 28 ; CHECK-NEXT: v_readlane_b32 s3, v22, 29 ; CHECK-NEXT: v_readlane_b32 s4, v22, 30 @@ -254,10 +248,22 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: v_readlane_b32 s6, v22, 32 ; CHECK-NEXT: v_readlane_b32 s7, v22, 33 ; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[38:39] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[44:47] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:7] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_readlane_b32 s0, v22, 34 ; CHECK-NEXT: v_readlane_b32 s1, v22, 35 +; CHECK-NEXT: v_readlane_b32 s2, v22, 36 +; CHECK-NEXT: v_readlane_b32 s3, v22, 37 +; CHECK-NEXT: v_readlane_b32 s4, v22, 38 +; CHECK-NEXT: v_readlane_b32 s5, v22, 39 +; CHECK-NEXT: v_readlane_b32 s6, v22, 40 +; CHECK-NEXT: v_readlane_b32 s7, v22, 41 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[16:31] ; CHECK-NEXT: ;;#ASMEND @@ -267,12 +273,6 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[40:43] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s2, v22, 36 -; CHECK-NEXT: v_readlane_b32 s3, v22, 37 -; CHECK-NEXT: v_readlane_b32 s4, v22, 38 -; CHECK-NEXT: v_readlane_b32 s5, v22, 39 -; CHECK-NEXT: v_readlane_b32 s6, v22, 40 -; CHECK-NEXT: v_readlane_b32 s7, v22, 41 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:7] ; CHECK-NEXT: ;;#ASMEND @@ -297,11 +297,11 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_readlane_b32 s0, v22, 58 ; CHECK-NEXT: v_readlane_b32 s1, v22, 59 +; CHECK-NEXT: v_readlane_b32 s2, v22, 60 +; CHECK-NEXT: v_readlane_b32 s3, v22, 61 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[34:35] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s2, v22, 60 -; CHECK-NEXT: v_readlane_b32 s3, v22, 61 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:3] ; CHECK-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll index 6b0ede1ac3ab8..e7bc851817f3a 100644 --- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll +++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll @@ -278,10 +278,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: .LBB1_25: ; %Flow23 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: s_load_dwordx4 s[4:7], s[38:39], 0x0 -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0 ; GLOBALNESS1-NEXT: v_readlane_b32 s70, v57, 8 -; GLOBALNESS1-NEXT: v_readlane_b32 s71, v57, 9 ; GLOBALNESS1-NEXT: v_readlane_b32 s8, v57, 10 +; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0 +; GLOBALNESS1-NEXT: v_readlane_b32 s71, v57, 9 ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) ; GLOBALNESS1-NEXT: s_mov_b32 s55, s7 ; GLOBALNESS1-NEXT: v_readlane_b32 s9, v57, 11 @@ -587,11 +587,11 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_branch .LBB1_29 ; GLOBALNESS0-NEXT: .LBB1_25: ; %Flow23 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 +; GLOBALNESS0-NEXT: v_readlane_b32 s84, v57, 8 +; GLOBALNESS0-NEXT: v_readlane_b32 s8, v57, 10 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0 ; GLOBALNESS0-NEXT: s_mov_b32 s55, s83 -; GLOBALNESS0-NEXT: v_readlane_b32 s84, v57, 8 ; GLOBALNESS0-NEXT: v_readlane_b32 s85, v57, 9 -; GLOBALNESS0-NEXT: v_readlane_b32 s8, v57, 10 ; GLOBALNESS0-NEXT: v_readlane_b32 s9, v57, 11 ; GLOBALNESS0-NEXT: .LBB1_26: ; %Flow24 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll index 57ddcb20d613c..e9a0671ead4e0 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -158,6 +158,7 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) ; GFX9-O0-NEXT: v_writelane_b32 v5, s34, 2 ; GFX9-O0-NEXT: v_writelane_b32 v5, s35, 3 ; GFX9-O0-NEXT: s_mov_b32 s34, 0 +; GFX9-O0-NEXT: s_nop 2 ; GFX9-O0-NEXT: buffer_load_dwordx2 v[3:4], off, s[36:39], s34 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll index 6a3a58e3ab120..72672c8b6efad 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll @@ -135,6 +135,7 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: v_writelane_b32 v5, s0, 3 ; GFX9-O0-NEXT: v_writelane_b32 v5, s1, 4 ; GFX9-O0-NEXT: s_mov_b32 s0, 0 +; GFX9-O0-NEXT: s_nop 2 ; GFX9-O0-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], s0 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill @@ -964,6 +965,7 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: v_writelane_b32 v5, s0, 3 ; GFX9-O0-NEXT: v_writelane_b32 v5, s1, 4 ; GFX9-O0-NEXT: s_mov_b32 s0, 0 +; GFX9-O0-NEXT: s_nop 2 ; GFX9-O0-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], s0 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill