From 51695f19156e7d0d9c07c66b5bd19d09be6a3a46 Mon Sep 17 00:00:00 2001 From: John Lu Date: Mon, 24 Nov 2025 12:18:12 -0600 Subject: [PATCH 1/5] Revert "Revert "[AMDGPU] Remove leftover implicit operands from SI_SPILL/SI_RESTORE." (#169068)" This reverts commit 4511c355c35153c6b8f5fd3d0b75f77c126fe8e6. --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 4 +- .../CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll | 1272 ++++++++--------- .../CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll | 62 +- llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll | 2 +- .../CodeGen/AMDGPU/fold-reload-into-exec.mir | 16 +- .../identical-subrange-spill-infloop.ll | 38 +- .../CodeGen/AMDGPU/insert_vector_dynelt.ll | 4 +- llvm/test/CodeGen/AMDGPU/load-constant-i1.ll | 4 +- .../AMDGPU/preserve-wwm-copy-dst-reg.ll | 40 +- .../scc-clobbered-sgpr-to-vmem-spill.ll | 44 +- .../AMDGPU/tuple-allocation-failure.ll | 8 +- .../test/CodeGen/AMDGPU/wwm-reserved-spill.ll | 1 - llvm/test/CodeGen/AMDGPU/wwm-reserved.ll | 2 - 13 files changed, 747 insertions(+), 750 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index a7333e3373f38..66e1873319553 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2094,11 +2094,11 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { break; case AMDGPU::SI_SPILL_S32_TO_VGPR: - MI.setDesc(get(AMDGPU::V_WRITELANE_B32)); + mutateAndCleanupImplicit(MI, get(AMDGPU::V_WRITELANE_B32)); break; case AMDGPU::SI_RESTORE_S32_FROM_VGPR: - MI.setDesc(get(AMDGPU::V_READLANE_B32)); + mutateAndCleanupImplicit(MI, get(AMDGPU::V_READLANE_B32)); break; case AMDGPU::AV_MOV_B32_IMM_PSEUDO: { Register Dst = MI.getOperand(0).getReg(); diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll index 4c5c56a49fdc6..4519e2796fed3 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll @@ 
-8181,8 +8181,8 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s18 ; SI-NEXT: v_readlane_b32 s18, v23, 0 ; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: v_readlane_b32 s19, v23, 1 ; SI-NEXT: s_lshl_b32 s18, s18, 8 +; SI-NEXT: v_readlane_b32 s19, v23, 1 ; SI-NEXT: s_or_b32 s16, s16, s18 ; SI-NEXT: v_readlane_b32 s18, v23, 2 ; SI-NEXT: v_readlane_b32 s19, v23, 3 @@ -8215,8 +8215,8 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s16 ; SI-NEXT: v_readlane_b32 s16, v23, 6 ; SI-NEXT: s_and_b32 s14, s14, 0xff -; SI-NEXT: v_readlane_b32 s17, v23, 7 ; SI-NEXT: s_lshl_b32 s16, s16, 8 +; SI-NEXT: v_readlane_b32 s17, v23, 7 ; SI-NEXT: s_or_b32 s14, s14, s16 ; SI-NEXT: v_readlane_b32 s16, v23, 8 ; SI-NEXT: v_readlane_b32 s17, v23, 9 @@ -8249,8 +8249,8 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s14 ; SI-NEXT: v_readlane_b32 s14, v23, 12 ; SI-NEXT: s_and_b32 s12, s12, 0xff -; SI-NEXT: v_readlane_b32 s15, v23, 13 ; SI-NEXT: s_lshl_b32 s14, s14, 8 +; SI-NEXT: v_readlane_b32 s15, v23, 13 ; SI-NEXT: s_or_b32 s12, s12, s14 ; SI-NEXT: v_readlane_b32 s14, v23, 14 ; SI-NEXT: v_readlane_b32 s15, v23, 15 @@ -8283,8 +8283,8 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s12 ; SI-NEXT: v_readlane_b32 s12, v23, 18 ; SI-NEXT: s_and_b32 s10, s10, 0xff -; SI-NEXT: v_readlane_b32 s13, v23, 19 ; SI-NEXT: s_lshl_b32 s12, s12, 8 +; SI-NEXT: v_readlane_b32 s13, v23, 19 ; SI-NEXT: s_or_b32 s10, s10, s12 ; SI-NEXT: v_readlane_b32 s12, v23, 20 ; SI-NEXT: v_readlane_b32 s13, v23, 21 @@ -8317,8 +8317,8 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: v_readlane_b32 s10, v23, 24 ; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: v_readlane_b32 s11, v23, 25 ; SI-NEXT: 
s_lshl_b32 s10, s10, 8 +; SI-NEXT: v_readlane_b32 s11, v23, 25 ; SI-NEXT: s_or_b32 s8, s8, s10 ; SI-NEXT: v_readlane_b32 s10, v23, 26 ; SI-NEXT: v_readlane_b32 s11, v23, 27 @@ -8350,8 +8350,8 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_readlane_b32 s8, v23, 30 ; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: v_readlane_b32 s9, v23, 31 ; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: v_readlane_b32 s9, v23, 31 ; SI-NEXT: s_or_b32 s6, s6, s8 ; SI-NEXT: v_readlane_b32 s8, v23, 32 ; SI-NEXT: v_readlane_b32 s9, v23, 33 @@ -8384,8 +8384,8 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_readlane_b32 s6, v23, 36 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: v_readlane_b32 s7, v23, 37 ; SI-NEXT: s_lshl_b32 s6, s6, 8 +; SI-NEXT: v_readlane_b32 s7, v23, 37 ; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: v_readlane_b32 s6, v23, 38 ; SI-NEXT: v_readlane_b32 s7, v23, 39 @@ -8468,148 +8468,149 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB13_4: -; SI-NEXT: ; implicit-def: $sgpr51 ; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v23, s50, 0 -; SI-NEXT: v_writelane_b32 v23, s51, 1 -; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr51 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s51, 1 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s50, 2 -; SI-NEXT: v_writelane_b32 v23, s51, 3 ; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: v_writelane_b32 v23, s50, 2 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s51, 3 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; 
implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s50, 4 -; SI-NEXT: v_writelane_b32 v23, s51, 5 ; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: v_writelane_b32 v23, s50, 4 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s51, 5 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s50, 6 -; SI-NEXT: v_writelane_b32 v23, s51, 7 ; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: v_writelane_b32 v23, s50, 6 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s51, 7 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s50, 8 -; SI-NEXT: v_writelane_b32 v23, s51, 9 ; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: v_writelane_b32 v23, s50, 8 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s51, 9 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s50, 10 -; SI-NEXT: v_writelane_b32 v23, s51, 11 ; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: v_writelane_b32 v23, s50, 10 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s51, 11 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s50, 12 -; SI-NEXT: v_writelane_b32 v23, s51, 13 ; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: v_writelane_b32 v23, s50, 12 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s51, 13 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s50, 14 -; SI-NEXT: v_writelane_b32 v23, s51, 15 ; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: v_writelane_b32 v23, s50, 14 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s51, 15 ; SI-NEXT: ; kill: killed $sgpr26 ; 
SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s50, 16 -; SI-NEXT: v_writelane_b32 v23, s51, 17 ; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: v_writelane_b32 v23, s50, 16 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s51, 17 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s50, 18 -; SI-NEXT: v_writelane_b32 v23, s51, 19 ; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: v_writelane_b32 v23, s50, 18 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s51, 19 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s50, 20 -; SI-NEXT: v_writelane_b32 v23, s51, 21 ; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: v_writelane_b32 v23, s50, 20 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s51, 21 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s50, 22 -; SI-NEXT: v_writelane_b32 v23, s51, 23 ; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: v_writelane_b32 v23, s50, 22 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s51, 23 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s50, 24 -; SI-NEXT: v_writelane_b32 v23, s51, 25 ; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: v_writelane_b32 v23, s50, 24 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s51, 25 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s50, 26 -; SI-NEXT: v_writelane_b32 v23, s51, 27 ; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: v_writelane_b32 v23, s50, 26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s51, 27 ; SI-NEXT: ; 
kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s50, 28 -; SI-NEXT: v_writelane_b32 v23, s51, 29 ; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: v_writelane_b32 v23, s50, 28 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s51, 29 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s50, 30 -; SI-NEXT: v_writelane_b32 v23, s51, 31 ; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: v_writelane_b32 v23, s50, 30 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s51, 31 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s50, 32 -; SI-NEXT: v_writelane_b32 v23, s51, 33 ; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: v_writelane_b32 v23, s50, 32 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s51, 33 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s50, 34 -; SI-NEXT: v_writelane_b32 v23, s51, 35 ; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: v_writelane_b32 v23, s50, 34 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s51, 35 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s50, 36 -; SI-NEXT: v_writelane_b32 v23, s51, 37 ; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: v_writelane_b32 v23, s50, 36 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s51, 37 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr50 ; SI-NEXT: v_writelane_b32 v23, s50, 38 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: v_writelane_b32 v23, s51, 39 -; SI-NEXT: ; implicit-def: $sgpr50 ; SI-NEXT: ; kill: killed $sgpr26 ; 
SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: v_writelane_b32 v23, s50, 40 ; SI-NEXT: ; implicit-def: $sgpr49 ; SI-NEXT: ; implicit-def: $sgpr55 ; SI-NEXT: ; implicit-def: $sgpr54 @@ -8634,7 +8635,6 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr58 ; SI-NEXT: ; implicit-def: $sgpr28 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s50, 40 ; SI-NEXT: ; implicit-def: $sgpr98 ; SI-NEXT: ; implicit-def: $sgpr96 ; SI-NEXT: ; implicit-def: $sgpr86 @@ -10597,10 +10597,9 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: .LBB13_4: ; GFX9-NEXT: ; implicit-def: $sgpr27 ; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr83 -; GFX9-NEXT: ; implicit-def: $sgpr82 ; GFX9-NEXT: ; implicit-def: $sgpr27 ; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr82 ; GFX9-NEXT: v_writelane_b32 v22, s82, 0 ; GFX9-NEXT: ; implicit-def: $sgpr27 ; GFX9-NEXT: ; kill: killed $sgpr27 @@ -10633,6 +10632,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX9-NEXT: ; implicit-def: $sgpr85 ; GFX9-NEXT: ; implicit-def: $sgpr84 ; GFX9-NEXT: ; implicit-def: $sgpr81 +; GFX9-NEXT: ; implicit-def: $sgpr83 ; GFX9-NEXT: ; implicit-def: $sgpr36 ; GFX9-NEXT: ; implicit-def: $sgpr34 ; GFX9-NEXT: ; implicit-def: $sgpr30 @@ -10985,12 +10985,10 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: s_lshr_b64 s[62:63], s[20:21], 24 ; GFX11-NEXT: s_branch .LBB13_3 ; GFX11-NEXT: .LBB13_2: -; GFX11-NEXT: ; implicit-def: $vcc_hi ; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: s_mov_b32 s101, -1 ; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 0 +; GFX11-NEXT: ; implicit-def: $vcc_hi +; GFX11-NEXT: ; implicit-def: $vcc_lo ; GFX11-NEXT: ; implicit-def: $sgpr42 ; 
GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 @@ -11001,7 +10999,6 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 1 -; GFX11-NEXT: ; implicit-def: $vcc_lo ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 @@ -11013,6 +11010,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 2 +; GFX11-NEXT: ; implicit-def: $vcc_lo ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 @@ -11024,7 +11022,6 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 3 -; GFX11-NEXT: ; implicit-def: $vcc_lo ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 @@ -11036,6 +11033,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 4 +; GFX11-NEXT: ; implicit-def: $vcc_lo ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 @@ -11044,10 +11042,12 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: s_mov_b32 s101, -1 +; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 5 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; 
GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 5 -; GFX11-NEXT: ; implicit-def: $vcc_lo ; GFX11-NEXT: ; implicit-def: $sgpr45 ; GFX11-NEXT: ; implicit-def: $sgpr44 ; GFX11-NEXT: ; implicit-def: $sgpr30 @@ -11111,17 +11111,17 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: ; implicit-def: $sgpr88 ; GFX11-NEXT: ; implicit-def: $sgpr76 ; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 6 -; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 7 ; GFX11-NEXT: ; implicit-def: $vcc_lo +; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 7 ; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 8 -; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 9 ; GFX11-NEXT: ; implicit-def: $vcc_lo +; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 9 ; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 10 -; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 11 ; GFX11-NEXT: ; implicit-def: $vcc_lo +; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 11 ; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 12 -; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 13 ; GFX11-NEXT: ; implicit-def: $vcc_lo +; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 13 ; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 14 ; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 15 ; GFX11-NEXT: .LBB13_3: ; %Flow @@ -44444,147 +44444,149 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v37 ; SI-NEXT: s_branch .LBB37_5 ; SI-NEXT: .LBB37_3: -; SI-NEXT: ; implicit-def: $sgpr61 ; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: v_writelane_b32 v61, s60, 0 +; SI-NEXT: ; implicit-def: $sgpr61 ; SI-NEXT: v_writelane_b32 v61, s61, 1 -; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: v_writelane_b32 v61, s60, 2 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s60, 2 ; SI-NEXT: v_writelane_b32 v61, s61, 3 -; 
SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: v_writelane_b32 v61, s60, 4 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s60, 4 ; SI-NEXT: v_writelane_b32 v61, s61, 5 -; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: v_writelane_b32 v61, s60, 6 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s60, 6 ; SI-NEXT: v_writelane_b32 v61, s61, 7 -; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: v_writelane_b32 v61, s60, 8 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s60, 8 ; SI-NEXT: v_writelane_b32 v61, s61, 9 -; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: v_writelane_b32 v61, s60, 10 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s60, 10 ; SI-NEXT: v_writelane_b32 v61, s61, 11 -; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: v_writelane_b32 v61, s60, 12 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s60, 12 ; SI-NEXT: v_writelane_b32 v61, s61, 13 -; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: v_writelane_b32 v61, s60, 14 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s60, 14 ; SI-NEXT: v_writelane_b32 v61, s61, 15 -; SI-NEXT: ; implicit-def: $sgpr60 ; 
SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: v_writelane_b32 v61, s60, 16 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s60, 16 ; SI-NEXT: v_writelane_b32 v61, s61, 17 -; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: v_writelane_b32 v61, s60, 18 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s60, 18 ; SI-NEXT: v_writelane_b32 v61, s61, 19 -; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: v_writelane_b32 v61, s60, 20 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s60, 20 ; SI-NEXT: v_writelane_b32 v61, s61, 21 -; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: v_writelane_b32 v61, s60, 22 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s60, 22 ; SI-NEXT: v_writelane_b32 v61, s61, 23 -; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: v_writelane_b32 v61, s60, 24 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s60, 24 ; SI-NEXT: v_writelane_b32 v61, s61, 25 -; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: v_writelane_b32 v61, s60, 26 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s60, 26 ; SI-NEXT: v_writelane_b32 v61, s61, 27 -; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; kill: killed $sgpr4 
; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: v_writelane_b32 v61, s60, 28 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s60, 28 ; SI-NEXT: v_writelane_b32 v61, s61, 29 -; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: v_writelane_b32 v61, s60, 30 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s60, 30 ; SI-NEXT: v_writelane_b32 v61, s61, 31 -; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: v_writelane_b32 v61, s60, 32 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s60, 32 ; SI-NEXT: v_writelane_b32 v61, s61, 33 -; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: v_writelane_b32 v61, s60, 34 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s60, 34 ; SI-NEXT: v_writelane_b32 v61, s61, 35 -; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: v_writelane_b32 v61, s60, 36 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s60, 36 ; SI-NEXT: v_writelane_b32 v61, s61, 37 -; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: v_writelane_b32 v61, s60, 38 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s60, 38 ; SI-NEXT: v_writelane_b32 v61, s61, 39 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; implicit-def: 
$sgpr60 +; SI-NEXT: v_writelane_b32 v61, s60, 40 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v61, s61, 41 ; SI-NEXT: ; implicit-def: $sgpr97 ; SI-NEXT: ; implicit-def: $sgpr96 ; SI-NEXT: ; implicit-def: $sgpr28 @@ -44593,8 +44595,6 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr27 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v61, s60, 40 -; SI-NEXT: v_writelane_b32 v61, s61, 41 ; SI-NEXT: ; implicit-def: $sgpr98 ; SI-NEXT: ; implicit-def: $sgpr86 ; SI-NEXT: ; implicit-def: $sgpr84 @@ -44766,6 +44766,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_mov_b32_e32 v55, s97 ; SI-NEXT: v_mov_b32_e32 v54, s96 ; SI-NEXT: v_mov_b32_e32 v52, s60 ; SI-NEXT: v_mov_b32_e32 v47, s28 @@ -44786,48 +44787,47 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_mov_b32_e32 v31, s46 ; SI-NEXT: v_mov_b32_e32 v36, s56 ; SI-NEXT: v_readlane_b32 s26, v61, 40 +; SI-NEXT: v_readlane_b32 s27, v61, 41 ; SI-NEXT: v_readlane_b32 s28, v61, 38 +; SI-NEXT: v_readlane_b32 s29, v61, 39 ; SI-NEXT: v_readlane_b32 s6, v61, 36 +; SI-NEXT: v_readlane_b32 s7, v61, 37 ; SI-NEXT: v_readlane_b32 s58, v61, 34 +; SI-NEXT: v_readlane_b32 s59, v61, 35 ; SI-NEXT: v_readlane_b32 s60, v61, 32 +; SI-NEXT: v_readlane_b32 s61, v61, 33 ; SI-NEXT: v_readlane_b32 s8, v61, 30 +; SI-NEXT: v_readlane_b32 s9, v61, 31 ; SI-NEXT: v_readlane_b32 s10, v61, 28 +; SI-NEXT: v_readlane_b32 s11, v61, 29 ; SI-NEXT: v_readlane_b32 s12, v61, 26 +; SI-NEXT: v_readlane_b32 s13, v61, 27 ; SI-NEXT: v_readlane_b32 s14, v61, 24 +; SI-NEXT: v_readlane_b32 s15, v61, 25 ; SI-NEXT: v_readlane_b32 s16, v61, 22 +; SI-NEXT: v_readlane_b32 s17, v61, 
23 ; SI-NEXT: s_mov_b32 s96, s94 ; SI-NEXT: v_readlane_b32 s94, v61, 20 -; SI-NEXT: v_readlane_b32 s18, v61, 18 -; SI-NEXT: v_readlane_b32 s20, v61, 16 -; SI-NEXT: v_readlane_b32 s22, v61, 14 -; SI-NEXT: v_readlane_b32 s24, v61, 12 -; SI-NEXT: v_readlane_b32 s40, v61, 10 -; SI-NEXT: v_readlane_b32 s42, v61, 8 -; SI-NEXT: v_readlane_b32 s44, v61, 6 -; SI-NEXT: v_readlane_b32 s46, v61, 4 -; SI-NEXT: v_readlane_b32 s56, v61, 2 -; SI-NEXT: v_readlane_b32 vcc_lo, v61, 0 -; SI-NEXT: v_mov_b32_e32 v55, s97 -; SI-NEXT: v_readlane_b32 s27, v61, 41 -; SI-NEXT: v_readlane_b32 s29, v61, 39 -; SI-NEXT: v_readlane_b32 s7, v61, 37 -; SI-NEXT: v_readlane_b32 s59, v61, 35 -; SI-NEXT: v_readlane_b32 s61, v61, 33 -; SI-NEXT: v_readlane_b32 s9, v61, 31 -; SI-NEXT: v_readlane_b32 s11, v61, 29 -; SI-NEXT: v_readlane_b32 s13, v61, 27 -; SI-NEXT: v_readlane_b32 s15, v61, 25 -; SI-NEXT: v_readlane_b32 s17, v61, 23 ; SI-NEXT: v_readlane_b32 s95, v61, 21 +; SI-NEXT: v_readlane_b32 s18, v61, 18 ; SI-NEXT: v_readlane_b32 s19, v61, 19 +; SI-NEXT: v_readlane_b32 s20, v61, 16 ; SI-NEXT: v_readlane_b32 s21, v61, 17 +; SI-NEXT: v_readlane_b32 s22, v61, 14 ; SI-NEXT: v_readlane_b32 s23, v61, 15 +; SI-NEXT: v_readlane_b32 s24, v61, 12 ; SI-NEXT: v_readlane_b32 s25, v61, 13 +; SI-NEXT: v_readlane_b32 s40, v61, 10 ; SI-NEXT: v_readlane_b32 s41, v61, 11 +; SI-NEXT: v_readlane_b32 s42, v61, 8 ; SI-NEXT: v_readlane_b32 s43, v61, 9 +; SI-NEXT: v_readlane_b32 s44, v61, 6 ; SI-NEXT: v_readlane_b32 s45, v61, 7 +; SI-NEXT: v_readlane_b32 s46, v61, 4 ; SI-NEXT: v_readlane_b32 s47, v61, 5 +; SI-NEXT: v_readlane_b32 s56, v61, 2 ; SI-NEXT: v_readlane_b32 s57, v61, 3 +; SI-NEXT: v_readlane_b32 vcc_lo, v61, 0 ; SI-NEXT: v_readlane_b32 vcc_hi, v61, 1 ; SI-NEXT: .LBB37_5: ; %end ; SI-NEXT: s_waitcnt vmcnt(14) @@ -81058,8 +81058,8 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s18 ; SI-NEXT: v_readlane_b32 s18, v23, 0 ; SI-NEXT: s_and_b32 s16, s16, 0xff 
-; SI-NEXT: v_readlane_b32 s19, v23, 1 ; SI-NEXT: s_lshl_b32 s18, s18, 8 +; SI-NEXT: v_readlane_b32 s19, v23, 1 ; SI-NEXT: s_or_b32 s16, s16, s18 ; SI-NEXT: v_readlane_b32 s18, v23, 2 ; SI-NEXT: v_readlane_b32 s19, v23, 3 @@ -81092,8 +81092,8 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s16 ; SI-NEXT: v_readlane_b32 s16, v23, 6 ; SI-NEXT: s_and_b32 s14, s14, 0xff -; SI-NEXT: v_readlane_b32 s17, v23, 7 ; SI-NEXT: s_lshl_b32 s16, s16, 8 +; SI-NEXT: v_readlane_b32 s17, v23, 7 ; SI-NEXT: s_or_b32 s14, s14, s16 ; SI-NEXT: v_readlane_b32 s16, v23, 8 ; SI-NEXT: v_readlane_b32 s17, v23, 9 @@ -81126,8 +81126,8 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s14 ; SI-NEXT: v_readlane_b32 s14, v23, 12 ; SI-NEXT: s_and_b32 s12, s12, 0xff -; SI-NEXT: v_readlane_b32 s15, v23, 13 ; SI-NEXT: s_lshl_b32 s14, s14, 8 +; SI-NEXT: v_readlane_b32 s15, v23, 13 ; SI-NEXT: s_or_b32 s12, s12, s14 ; SI-NEXT: v_readlane_b32 s14, v23, 14 ; SI-NEXT: v_readlane_b32 s15, v23, 15 @@ -81160,8 +81160,8 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s12 ; SI-NEXT: v_readlane_b32 s12, v23, 18 ; SI-NEXT: s_and_b32 s10, s10, 0xff -; SI-NEXT: v_readlane_b32 s13, v23, 19 ; SI-NEXT: s_lshl_b32 s12, s12, 8 +; SI-NEXT: v_readlane_b32 s13, v23, 19 ; SI-NEXT: s_or_b32 s10, s10, s12 ; SI-NEXT: v_readlane_b32 s12, v23, 20 ; SI-NEXT: v_readlane_b32 s13, v23, 21 @@ -81194,8 +81194,8 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: v_readlane_b32 s10, v23, 24 ; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: v_readlane_b32 s11, v23, 25 ; SI-NEXT: s_lshl_b32 s10, s10, 8 +; SI-NEXT: v_readlane_b32 s11, v23, 25 ; SI-NEXT: s_or_b32 s8, s8, s10 ; SI-NEXT: v_readlane_b32 s10, v23, 26 ; SI-NEXT: v_readlane_b32 s11, v23, 27 @@ -81228,8 +81228,8 @@ define 
inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_readlane_b32 s8, v23, 30 ; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: v_readlane_b32 s9, v23, 31 ; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: v_readlane_b32 s9, v23, 31 ; SI-NEXT: s_or_b32 s6, s6, s8 ; SI-NEXT: v_readlane_b32 s8, v23, 32 ; SI-NEXT: v_readlane_b32 s9, v23, 33 @@ -81347,8 +81347,6 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v23, s54, 0 ; SI-NEXT: ; implicit-def: $sgpr26 @@ -81357,172 +81355,174 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; implicit-def: $sgpr53 -; SI-NEXT: ; implicit-def: $sgpr52 -; SI-NEXT: ; implicit-def: $sgpr51 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr49 -; SI-NEXT: ; implicit-def: $sgpr38 -; SI-NEXT: ; implicit-def: $sgpr36 -; SI-NEXT: ; implicit-def: $sgpr34 -; SI-NEXT: ; implicit-def: $sgpr30 -; SI-NEXT: ; implicit-def: $sgpr94 -; SI-NEXT: ; implicit-def: $sgpr92 -; SI-NEXT: ; implicit-def: $sgpr90 -; SI-NEXT: ; implicit-def: $sgpr88 -; SI-NEXT: ; implicit-def: $sgpr78 -; SI-NEXT: ; implicit-def: $sgpr76 -; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr28 -; SI-NEXT: ; implicit-def: $sgpr98 -; SI-NEXT: ; implicit-def: $sgpr96 -; SI-NEXT: ; implicit-def: $sgpr86 -; SI-NEXT: ; implicit-def: $sgpr84 -; SI-NEXT: ; implicit-def: $sgpr82 -; SI-NEXT: ; implicit-def: 
$sgpr80 -; SI-NEXT: ; implicit-def: $sgpr70 -; SI-NEXT: ; implicit-def: $sgpr68 -; SI-NEXT: ; implicit-def: $sgpr66 -; SI-NEXT: ; implicit-def: $sgpr64 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: v_writelane_b32 v23, s54, 2 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: v_writelane_b32 v23, s55, 3 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: v_writelane_b32 v23, s54, 4 -; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s55, 5 -; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s55, 5 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s54, 6 -; SI-NEXT: v_writelane_b32 v23, s55, 7 ; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s54, 6 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s55, 7 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s54, 8 -; SI-NEXT: v_writelane_b32 v23, s55, 9 -; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: v_writelane_b32 v23, s54, 8 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s55, 9 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s54, 10 -; SI-NEXT: v_writelane_b32 v23, s55, 11 -; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: v_writelane_b32 v23, s54, 10 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s55, 11 ; SI-NEXT: ; implicit-def: $sgpr26 -; 
SI-NEXT: v_writelane_b32 v23, s54, 12 -; SI-NEXT: v_writelane_b32 v23, s55, 13 -; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: v_writelane_b32 v23, s54, 12 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s55, 13 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s54, 14 -; SI-NEXT: v_writelane_b32 v23, s55, 15 -; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: v_writelane_b32 v23, s54, 14 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s55, 15 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s54, 16 -; SI-NEXT: v_writelane_b32 v23, s55, 17 -; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: v_writelane_b32 v23, s54, 16 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s55, 17 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s54, 18 -; SI-NEXT: v_writelane_b32 v23, s55, 19 -; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: v_writelane_b32 v23, s54, 18 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s55, 19 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s54, 20 -; SI-NEXT: v_writelane_b32 v23, s55, 21 -; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: v_writelane_b32 v23, s54, 20 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s55, 21 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s54, 22 -; SI-NEXT: v_writelane_b32 v23, s55, 23 -; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; 
kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: v_writelane_b32 v23, s54, 22 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s55, 23 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s54, 24 -; SI-NEXT: v_writelane_b32 v23, s55, 25 -; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: v_writelane_b32 v23, s54, 24 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s55, 25 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s54, 26 -; SI-NEXT: v_writelane_b32 v23, s55, 27 -; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: v_writelane_b32 v23, s54, 26 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s55, 27 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s54, 28 -; SI-NEXT: v_writelane_b32 v23, s55, 29 -; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: v_writelane_b32 v23, s54, 28 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s55, 29 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s54, 30 -; SI-NEXT: v_writelane_b32 v23, s55, 31 -; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: v_writelane_b32 v23, s54, 30 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s55, 31 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s54, 32 -; SI-NEXT: v_writelane_b32 v23, s55, 33 -; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: v_writelane_b32 v23, s54, 32 ; SI-NEXT: ; implicit-def: 
$sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s55, 33 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s54, 34 -; SI-NEXT: v_writelane_b32 v23, s55, 35 -; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: v_writelane_b32 v23, s54, 34 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s55, 35 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v23, s54, 36 -; SI-NEXT: v_writelane_b32 v23, s55, 37 +; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: v_writelane_b32 v23, s54, 36 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v23, s55, 37 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: v_writelane_b32 v23, s54, 38 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr98 +; SI-NEXT: ; implicit-def: $sgpr96 +; SI-NEXT: ; implicit-def: $sgpr86 +; SI-NEXT: ; 
implicit-def: $sgpr84 +; SI-NEXT: ; implicit-def: $sgpr82 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr64 ; SI-NEXT: v_writelane_b32 v23, s55, 39 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: s_branch .LBB57_2 ; ; VI-LABEL: bitcast_v16i64_to_v128i8_scalar: @@ -83473,10 +83473,9 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: .LBB57_4: ; GFX9-NEXT: ; implicit-def: $sgpr27 ; GFX9-NEXT: ; kill: killed $sgpr27 -; GFX9-NEXT: ; implicit-def: $sgpr83 -; GFX9-NEXT: ; implicit-def: $sgpr82 ; GFX9-NEXT: ; implicit-def: $sgpr27 ; GFX9-NEXT: ; kill: killed $sgpr27 +; GFX9-NEXT: ; implicit-def: $sgpr82 ; GFX9-NEXT: v_writelane_b32 v22, s82, 0 ; GFX9-NEXT: ; implicit-def: $sgpr27 ; GFX9-NEXT: ; kill: killed $sgpr27 @@ -83509,6 +83508,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX9-NEXT: ; implicit-def: $sgpr85 ; GFX9-NEXT: ; implicit-def: $sgpr84 ; GFX9-NEXT: ; implicit-def: $sgpr81 +; GFX9-NEXT: ; implicit-def: $sgpr83 ; GFX9-NEXT: ; implicit-def: $sgpr36 ; GFX9-NEXT: ; implicit-def: $sgpr34 ; GFX9-NEXT: ; implicit-def: $sgpr30 @@ -83861,12 +83861,10 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: s_lshr_b64 s[62:63], s[20:21], 24 ; GFX11-NEXT: s_branch .LBB57_3 ; GFX11-NEXT: .LBB57_2: -; GFX11-NEXT: ; implicit-def: $vcc_hi ; GFX11-NEXT: ; implicit-def: $vcc_lo -; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: s_mov_b32 s101, -1 ; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 0 +; GFX11-NEXT: ; implicit-def: $vcc_hi +; GFX11-NEXT: ; implicit-def: $vcc_lo ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 
@@ -83877,7 +83875,6 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 1 -; GFX11-NEXT: ; implicit-def: $vcc_lo ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 @@ -83889,6 +83886,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 2 +; GFX11-NEXT: ; implicit-def: $vcc_lo ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 @@ -83900,7 +83898,6 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 3 -; GFX11-NEXT: ; implicit-def: $vcc_lo ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 @@ -83912,6 +83909,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 4 +; GFX11-NEXT: ; implicit-def: $vcc_lo ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 @@ -83920,10 +83918,12 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 +; GFX11-NEXT: s_mov_b32 s101, -1 +; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 5 +; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 
5 -; GFX11-NEXT: ; implicit-def: $vcc_lo ; GFX11-NEXT: ; implicit-def: $sgpr45 ; GFX11-NEXT: ; implicit-def: $sgpr44 ; GFX11-NEXT: ; implicit-def: $sgpr30 @@ -83987,17 +83987,17 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: ; implicit-def: $sgpr88 ; GFX11-NEXT: ; implicit-def: $sgpr76 ; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 6 -; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 7 ; GFX11-NEXT: ; implicit-def: $vcc_lo +; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 7 ; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 8 -; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 9 ; GFX11-NEXT: ; implicit-def: $vcc_lo +; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 9 ; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 10 -; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 11 ; GFX11-NEXT: ; implicit-def: $vcc_lo +; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 11 ; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 12 -; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 13 ; GFX11-NEXT: ; implicit-def: $vcc_lo +; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 13 ; GFX11-NEXT: v_writelane_b32 v37, vcc_lo, 14 ; GFX11-NEXT: v_writelane_b32 v37, vcc_hi, 15 ; GFX11-NEXT: .LBB57_3: ; %Flow @@ -115371,8 +115371,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: v_writelane_b32 v62, s48, 0 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 @@ -115380,37 +115378,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr38 -; SI-NEXT: ; implicit-def: $sgpr36 -; SI-NEXT: ; implicit-def: $sgpr34 -; SI-NEXT: ; implicit-def: $sgpr30 -; SI-NEXT: ; implicit-def: $sgpr94 -; SI-NEXT: ; implicit-def: $sgpr92 -; SI-NEXT: ; 
implicit-def: $sgpr90 -; SI-NEXT: ; implicit-def: $sgpr88 -; SI-NEXT: ; implicit-def: $sgpr78 -; SI-NEXT: ; implicit-def: $sgpr76 -; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr28 -; SI-NEXT: ; implicit-def: $sgpr98 -; SI-NEXT: ; implicit-def: $sgpr96 -; SI-NEXT: ; implicit-def: $sgpr86 -; SI-NEXT: ; implicit-def: $sgpr84 -; SI-NEXT: ; implicit-def: $sgpr82 -; SI-NEXT: ; implicit-def: $sgpr80 -; SI-NEXT: ; implicit-def: $sgpr70 -; SI-NEXT: ; implicit-def: $sgpr68 -; SI-NEXT: ; implicit-def: $sgpr66 -; SI-NEXT: ; implicit-def: $sgpr64 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; implicit-def: $sgpr52 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: v_writelane_b32 v62, s48, 2 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 @@ -115418,8 +115385,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: v_writelane_b32 v62, s48, 4 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 @@ -115427,8 +115392,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: v_writelane_b32 v62, s48, 6 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 @@ -115436,8 +115399,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; 
implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: v_writelane_b32 v62, s48, 8 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 @@ -115445,8 +115406,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: v_writelane_b32 v62, s48, 10 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 @@ -115454,8 +115413,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: v_writelane_b32 v62, s48, 12 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 @@ -115463,8 +115420,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: v_writelane_b32 v62, s48, 14 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 @@ -115472,8 +115427,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: v_writelane_b32 v62, s48, 16 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 @@ -115481,8 +115434,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: 
killed $sgpr26 ; SI-NEXT: v_writelane_b32 v62, s48, 18 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 @@ -115490,8 +115441,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: v_writelane_b32 v62, s48, 20 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 @@ -115499,8 +115448,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: v_writelane_b32 v62, s48, 22 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 @@ -115508,44 +115455,97 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: v_writelane_b32 v62, s48, 24 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: v_writelane_b32 v62, s49, 25 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr48 ; SI-NEXT: ; kill: killed $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr48 ; SI-NEXT: v_writelane_b32 v62, s48, 26 -; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: v_writelane_b32 v62, s49, 27 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: v_writelane_b32 v62, s48, 28 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v62, s49, 29 ; SI-NEXT: ; implicit-def: $sgpr26 ; 
SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: v_writelane_b32 v62, s48, 30 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v62, s48, 28 -; SI-NEXT: v_writelane_b32 v62, s49, 29 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v62, s49, 31 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: v_writelane_b32 v62, s48, 32 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: v_writelane_b32 v62, s49, 33 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v62, s48, 30 -; SI-NEXT: v_writelane_b32 v62, s49, 31 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr98 +; SI-NEXT: ; implicit-def: $sgpr96 +; SI-NEXT: ; implicit-def: $sgpr86 +; SI-NEXT: ; implicit-def: $sgpr84 +; SI-NEXT: ; implicit-def: $sgpr82 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr50 ; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed 
$sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; kill: killed $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: v_writelane_b32 v62, s48, 32 -; SI-NEXT: v_writelane_b32 v62, s49, 33 ; SI-NEXT: ; kill: killed $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr48 ; SI-NEXT: s_branch .LBB73_2 ; SI-NEXT: .LBB73_4: ; SI-NEXT: v_mov_b32_e32 v1, s38 @@ -115601,9 +115601,9 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: v_mov_b32_e32 v16, s26 ; SI-NEXT: v_readlane_b32 s26, v62, 30 ; SI-NEXT: v_readlane_b32 s27, v62, 31 +; SI-NEXT: v_mov_b32_e32 v3, s8 ; SI-NEXT: v_mov_b32_e32 v51, s26 ; SI-NEXT: v_readlane_b32 s26, v62, 32 -; SI-NEXT: v_mov_b32_e32 v3, s8 ; SI-NEXT: v_readlane_b32 s27, v62, 33 ; SI-NEXT: v_mov_b32_e32 v38, s72 ; SI-NEXT: v_mov_b32_e32 v49, s62 @@ -164616,8 +164616,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_lshr_b32 s13, s4, 16 ; SI-NEXT: s_mov_b32 s5, s13 ; SI-NEXT: v_writelane_b32 v61, s4, 26 -; SI-NEXT: v_writelane_b32 v61, s5, 27 ; SI-NEXT: v_readfirstlane_b32 s4, v46 +; SI-NEXT: v_writelane_b32 v61, s5, 27 ; SI-NEXT: s_lshr_b32 s5, s4, 16 ; SI-NEXT: v_readfirstlane_b32 s4, v10 ; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 16 @@ -164811,191 +164811,92 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_branch .LBB91_3 ; 
SI-NEXT: .LBB91_2: ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v54, v59 ; SI-NEXT: v_writelane_b32 v62, s4, 0 ; SI-NEXT: v_writelane_b32 v62, s5, 1 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v59, v51 ; SI-NEXT: v_writelane_b32 v62, s4, 2 ; SI-NEXT: v_writelane_b32 v62, s5, 3 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v31, v46 ; SI-NEXT: v_writelane_b32 v62, s4, 4 ; SI-NEXT: v_writelane_b32 v62, s5, 5 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v34, v22 ; SI-NEXT: v_writelane_b32 v62, s4, 6 ; SI-NEXT: v_writelane_b32 v62, s5, 7 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v22, v24 ; SI-NEXT: v_writelane_b32 v62, s4, 8 ; SI-NEXT: v_writelane_b32 v62, s5, 9 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v7, v37 ; SI-NEXT: v_writelane_b32 v62, s4, 10 ; SI-NEXT: v_writelane_b32 v62, s5, 11 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: v_writelane_b32 v62, s4, 12 ; SI-NEXT: v_writelane_b32 v62, s5, 13 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: v_writelane_b32 v62, s4, 14 ; SI-NEXT: v_writelane_b32 v62, s5, 15 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v54, v59 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr21 -; SI-NEXT: ; implicit-def: $sgpr20 -; SI-NEXT: ; implicit-def: $sgpr89 -; SI-NEXT: ; implicit-def: $sgpr88 -; SI-NEXT: v_mov_b32_e32 v44, v1 ; SI-NEXT: v_writelane_b32 v62, s4, 16 +; SI-NEXT: v_mov_b32_e32 v59, v51 +; SI-NEXT: v_mov_b32_e32 v31, v46 +; SI-NEXT: v_mov_b32_e32 v34, v22 +; SI-NEXT: v_mov_b32_e32 v22, v24 +; SI-NEXT: v_mov_b32_e32 v7, v37 +; SI-NEXT: 
buffer_load_dword v37, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: v_writelane_b32 v62, s5, 17 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: v_mov_b32_e32 v52, v17 ; SI-NEXT: v_writelane_b32 v62, s4, 18 ; SI-NEXT: v_writelane_b32 v62, s5, 19 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v43, v20 ; SI-NEXT: v_writelane_b32 v62, s4, 20 ; SI-NEXT: v_writelane_b32 v62, s5, 21 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v42, v32 ; SI-NEXT: v_writelane_b32 v62, s4, 22 ; SI-NEXT: v_writelane_b32 v62, s5, 23 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v41, v5 ; SI-NEXT: v_writelane_b32 v62, s4, 24 ; SI-NEXT: v_writelane_b32 v62, s5, 25 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: s_mov_b64 vcc, -1 ; SI-NEXT: v_writelane_b32 v62, s4, 26 ; SI-NEXT: v_writelane_b32 v62, s5, 27 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v60, v35 ; SI-NEXT: v_writelane_b32 v62, s4, 28 ; SI-NEXT: v_writelane_b32 v62, s5, 29 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: v_mov_b32_e32 v35, v6 ; SI-NEXT: v_writelane_b32 v62, s4, 30 ; SI-NEXT: v_writelane_b32 v62, s5, 31 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v32, v4 ; SI-NEXT: v_writelane_b32 v62, s4, 32 ; SI-NEXT: v_writelane_b32 v62, s5, 33 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v30, v12 ; SI-NEXT: v_writelane_b32 v62, s4, 34 ; SI-NEXT: v_writelane_b32 v62, s5, 35 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v19, v39 ; SI-NEXT: v_writelane_b32 v62, s4, 36 ; SI-NEXT: v_writelane_b32 v62, s5, 37 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: v_mov_b32_e32 
v39, v25 ; SI-NEXT: v_writelane_b32 v62, s4, 38 ; SI-NEXT: v_writelane_b32 v62, s5, 39 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v12, v29 ; SI-NEXT: v_writelane_b32 v62, s4, 40 ; SI-NEXT: v_writelane_b32 v62, s5, 41 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v20, v2 ; SI-NEXT: v_writelane_b32 v62, s4, 42 ; SI-NEXT: v_writelane_b32 v62, s5, 43 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v6, v55 ; SI-NEXT: v_writelane_b32 v62, s4, 44 ; SI-NEXT: v_writelane_b32 v62, s5, 45 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v17, v8 ; SI-NEXT: v_writelane_b32 v62, s4, 46 ; SI-NEXT: v_writelane_b32 v62, s5, 47 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v29, v33 ; SI-NEXT: v_writelane_b32 v62, s4, 48 ; SI-NEXT: v_writelane_b32 v62, s5, 49 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr96 -; SI-NEXT: ; implicit-def: $sgpr78 -; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $sgpr86 -; SI-NEXT: ; implicit-def: $sgpr84 -; SI-NEXT: ; implicit-def: $sgpr61 -; SI-NEXT: ; implicit-def: $sgpr65 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $sgpr80 -; SI-NEXT: ; implicit-def: $sgpr70 -; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $sgpr69 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $sgpr66 -; SI-NEXT: ; implicit-def: $sgpr38 -; SI-NEXT: ; implicit-def: $sgpr75 -; SI-NEXT: ; implicit-def: $sgpr91 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $sgpr52 -; SI-NEXT: ; implicit-def: $sgpr98 -; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr37 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $sgpr30 -; SI-NEXT: ; implicit-def: $sgpr82 -; SI-NEXT: ; implicit-def: $sgpr43 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: 
$sgpr27 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $sgpr28 -; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $sgpr22 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $sgpr18 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $sgpr15 -; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; implicit-def: $sgpr94 -; SI-NEXT: ; implicit-def: $sgpr57 -; SI-NEXT: ; implicit-def: $sgpr92 -; SI-NEXT: ; implicit-def: $sgpr64 -; SI-NEXT: ; implicit-def: $sgpr79 -; SI-NEXT: ; implicit-def: $sgpr76 -; SI-NEXT: ; implicit-def: $sgpr90 -; SI-NEXT: ; implicit-def: $sgpr73 -; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; implicit-def: $sgpr59 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr68 -; SI-NEXT: ; implicit-def: $sgpr45 -; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $sgpr29 -; SI-NEXT: ; implicit-def: $sgpr24 -; SI-NEXT: ; implicit-def: $sgpr34 -; SI-NEXT: ; implicit-def: $sgpr23 -; SI-NEXT: ; implicit-def: $sgpr16 -; SI-NEXT: ; implicit-def: $sgpr36 -; SI-NEXT: ; implicit-def: $sgpr19 -; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr13 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: v_writelane_b32 v62, s4, 50 ; SI-NEXT: v_writelane_b32 v62, s5, 51 ; SI-NEXT: ; implicit-def: $sgpr4 @@ -165015,7 +164916,6 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_writelane_b32 v62, s5, 61 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: v_writelane_b32 v62, s4, 62 -; SI-NEXT: v_writelane_b32 v62, s5, 63 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: v_writelane_b32 v61, s4, 0 ; 
SI-NEXT: v_writelane_b32 v61, s5, 1 @@ -165055,18 +164955,118 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: v_writelane_b32 v61, s4, 24 ; SI-NEXT: v_writelane_b32 v61, s5, 25 -; SI-NEXT: ; implicit-def: $sgpr5 ; SI-NEXT: v_writelane_b32 v61, s4, 26 +; SI-NEXT: v_writelane_b32 v62, s5, 63 +; SI-NEXT: ; implicit-def: $sgpr5 ; SI-NEXT: v_writelane_b32 v61, s5, 27 +; SI-NEXT: ; implicit-def: $sgpr20 ; SI-NEXT: v_writelane_b32 v61, s20, 28 +; SI-NEXT: ; implicit-def: $sgpr21 ; SI-NEXT: v_writelane_b32 v61, s21, 29 ; SI-NEXT: ; implicit-def: $sgpr20 -; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: v_writelane_b32 v61, s20, 30 ; SI-NEXT: v_writelane_b32 v61, s21, 31 +; SI-NEXT: v_mov_b32_e32 v44, v1 +; SI-NEXT: ; implicit-def: $sgpr88 ; SI-NEXT: v_writelane_b32 v61, s88, 32 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_mov_b32_e32 v52, v17 +; SI-NEXT: v_mov_b32_e32 v43, v20 +; SI-NEXT: v_mov_b32_e32 v42, v32 +; SI-NEXT: v_mov_b32_e32 v41, v5 +; SI-NEXT: s_mov_b64 vcc, -1 +; SI-NEXT: ; implicit-def: $sgpr89 ; SI-NEXT: v_writelane_b32 v61, s89, 33 +; SI-NEXT: v_mov_b32_e32 v60, v35 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: v_mov_b32_e32 v35, v6 +; SI-NEXT: v_mov_b32_e32 v32, v4 +; SI-NEXT: v_mov_b32_e32 v30, v12 +; SI-NEXT: v_mov_b32_e32 v19, v39 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_mov_b32_e32 v39, v25 +; SI-NEXT: v_mov_b32_e32 v12, v29 +; SI-NEXT: v_mov_b32_e32 v20, v2 +; SI-NEXT: v_mov_b32_e32 v6, v55 +; SI-NEXT: v_mov_b32_e32 v17, v8 +; SI-NEXT: v_mov_b32_e32 v29, v33 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr96 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $sgpr86 +; SI-NEXT: ; implicit-def: $sgpr84 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; 
implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr98 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr37 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr82 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr16 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr19 
+; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: .LBB91_3: ; %Flow ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -165091,30 +165091,29 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v53 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v38 ; SI-NEXT: v_readfirstlane_b32 s6, v9 ; SI-NEXT: v_readfirstlane_b32 s8, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v40 -; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v38 ; SI-NEXT: s_lshr_b32 s9, s6, 16 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_readfirstlane_b32 s6, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: s_lshr_b64 s[10:11], s[8:9], 16 -; SI-NEXT: s_lshr_b32 s9, s6, 16 +; SI-NEXT: v_readfirstlane_b32 s6, v8 ; SI-NEXT: v_readfirstlane_b32 s8, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v54 -; SI-NEXT: s_mov_b32 s7, s9 +; SI-NEXT: s_lshr_b32 s9, s6, 16 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v58 -; SI-NEXT: v_writelane_b32 v61, s6, 26 ; SI-NEXT: s_lshr_b64 s[20:21], s[8:9], 16 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_readfirstlane_b32 s8, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v49 -; SI-NEXT: v_writelane_b32 v61, s7, 27 +; SI-NEXT: v_writelane_b32 v61, s6, 26 ; SI-NEXT: v_readfirstlane_b32 s6, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50 +; SI-NEXT: s_mov_b32 s7, s9 ; SI-NEXT: s_lshr_b32 s9, s6, 16 ; SI-NEXT: 
v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_readfirstlane_b32 s18, v3 @@ -165135,6 +165134,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_writelane_b32 v61, s7, 27 ; SI-NEXT: s_lshr_b64 s[26:27], s[18:19], 16 ; SI-NEXT: s_mov_b32 s17, s26 ; SI-NEXT: s_mov_b32 s11, s20 @@ -165585,8 +165585,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: .LBB91_5: ; %end ; SI-NEXT: s_and_b32 s5, s8, 0xff ; SI-NEXT: v_readlane_b32 s8, v62, 0 -; SI-NEXT: v_readlane_b32 s9, v62, 1 ; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: v_readlane_b32 s9, v62, 1 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: v_readlane_b32 s8, v62, 2 ; SI-NEXT: v_readlane_b32 s9, v62, 3 @@ -165613,10 +165613,10 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_readlane_b32 vcc_hi, v62, 5 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: s_and_b32 s5, s86, 0xff -; SI-NEXT: v_readlane_b32 s9, v62, 7 ; SI-NEXT: s_lshl_b32 s8, s8, 8 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: v_readlane_b32 s9, v62, 7 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: v_readlane_b32 s8, v62, 8 ; SI-NEXT: v_readlane_b32 s9, v62, 9 @@ -165644,8 +165644,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_readlane_b32 s8, v62, 12 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: s_and_b32 s5, s80, 0xff -; SI-NEXT: v_readlane_b32 s9, v62, 13 ; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: v_readlane_b32 s9, v62, 13 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: v_readlane_b32 s8, v62, 14 ; SI-NEXT: v_readlane_b32 s9, v62, 15 @@ -165674,9 +165674,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_readlane_b32 s8, v62, 18 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; 
SI-NEXT: s_and_b32 s5, s66, 0xff -; SI-NEXT: v_readlane_b32 s9, v62, 19 ; SI-NEXT: s_lshl_b32 s8, s8, 8 -; SI-NEXT: v_readlane_b32 s61, v62, 17 +; SI-NEXT: v_readlane_b32 s9, v62, 19 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: v_readlane_b32 s8, v62, 20 ; SI-NEXT: v_readlane_b32 s9, v62, 21 @@ -165705,9 +165704,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_readlane_b32 s8, v62, 24 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: s_and_b32 s5, s52, 0xff -; SI-NEXT: v_readlane_b32 s9, v62, 25 ; SI-NEXT: s_lshl_b32 s8, s8, 8 -; SI-NEXT: v_readlane_b32 s61, v62, 23 +; SI-NEXT: v_readlane_b32 s9, v62, 25 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: v_readlane_b32 s8, v62, 26 ; SI-NEXT: v_readlane_b32 s9, v62, 27 @@ -165736,9 +165734,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_readlane_b32 s8, v62, 30 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: s_and_b32 s5, s30, 0xff -; SI-NEXT: v_readlane_b32 s9, v62, 31 ; SI-NEXT: s_lshl_b32 s8, s8, 8 -; SI-NEXT: v_readlane_b32 s61, v62, 29 +; SI-NEXT: v_readlane_b32 s9, v62, 31 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: v_readlane_b32 s8, v62, 32 ; SI-NEXT: v_readlane_b32 s9, v62, 33 @@ -165767,8 +165764,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_readlane_b32 s8, v62, 36 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: s_and_b32 s5, s50, 0xff -; SI-NEXT: v_readlane_b32 s9, v62, 37 ; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: v_readlane_b32 s9, v62, 37 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: v_readlane_b32 s8, v62, 38 ; SI-NEXT: v_readlane_b32 s9, v62, 39 @@ -165797,9 +165794,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_readlane_b32 s8, v62, 42 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: s_and_b32 s5, s92, 0xff -; SI-NEXT: v_readlane_b32 s9, v62, 43 ; SI-NEXT: s_lshl_b32 s8, s8, 8 -; SI-NEXT: v_readlane_b32 s43, v62, 41 +; 
SI-NEXT: v_readlane_b32 s9, v62, 43 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: v_readlane_b32 s8, v62, 44 ; SI-NEXT: v_readlane_b32 s9, v62, 45 @@ -165828,8 +165824,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_readlane_b32 s8, v62, 48 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: s_and_b32 s5, s76, 0xff -; SI-NEXT: v_readlane_b32 s9, v62, 49 ; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: v_readlane_b32 s9, v62, 49 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: v_readlane_b32 s8, v62, 50 ; SI-NEXT: v_readlane_b32 s9, v62, 51 @@ -165858,9 +165854,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_readlane_b32 s8, v62, 54 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: s_and_b32 s5, s62, 0xff -; SI-NEXT: v_readlane_b32 s9, v62, 55 ; SI-NEXT: s_lshl_b32 s8, s8, 8 -; SI-NEXT: v_readlane_b32 s27, v62, 53 +; SI-NEXT: v_readlane_b32 s9, v62, 55 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: v_readlane_b32 s8, v62, 56 ; SI-NEXT: v_readlane_b32 s9, v62, 57 @@ -165889,9 +165884,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_readlane_b32 s8, v62, 60 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: s_and_b32 s5, s46, 0xff -; SI-NEXT: v_readlane_b32 s9, v62, 61 ; SI-NEXT: s_lshl_b32 s8, s8, 8 -; SI-NEXT: v_readlane_b32 s27, v62, 59 +; SI-NEXT: v_readlane_b32 s9, v62, 61 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: v_readlane_b32 s8, v62, 62 ; SI-NEXT: v_readlane_b32 s9, v62, 63 @@ -165920,9 +165914,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_readlane_b32 s8, v61, 2 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: s_and_b32 s5, s40, 0xff -; SI-NEXT: v_readlane_b32 s9, v61, 3 ; SI-NEXT: s_lshl_b32 s8, s8, 8 -; SI-NEXT: v_readlane_b32 s27, v61, 1 +; SI-NEXT: v_readlane_b32 s9, v61, 3 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: v_readlane_b32 s8, v61, 4 ; SI-NEXT: v_readlane_b32 s9, v61, 5 @@ -165951,8 
+165944,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_readlane_b32 s8, v61, 8 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: s_and_b32 s5, s24, 0xff -; SI-NEXT: v_readlane_b32 s9, v61, 9 ; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: v_readlane_b32 s9, v61, 9 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: v_readlane_b32 s8, v61, 10 ; SI-NEXT: v_readlane_b32 s9, v61, 11 @@ -165981,8 +165974,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_readlane_b32 s8, v61, 14 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: s_and_b32 s5, s16, 0xff -; SI-NEXT: v_readlane_b32 s9, v61, 15 ; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: v_readlane_b32 s9, v61, 15 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: v_readlane_b32 s8, v61, 16 ; SI-NEXT: v_readlane_b32 s9, v61, 17 @@ -166007,14 +166000,24 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_and_b32 s8, s19, 0xff ; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: v_readlane_b32 s9, v61, 21 +; SI-NEXT: v_readlane_b32 s9, v61, 23 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: v_readlane_b32 s61, v62, 17 +; SI-NEXT: v_readlane_b32 s27, v62, 53 +; SI-NEXT: v_readlane_b32 s61, v62, 23 +; SI-NEXT: v_readlane_b32 s27, v62, 59 +; SI-NEXT: v_readlane_b32 s61, v62, 29 +; SI-NEXT: v_readlane_b32 s43, v62, 41 +; SI-NEXT: v_readlane_b32 s27, v61, 1 ; SI-NEXT: v_readlane_b32 s61, v62, 35 ; SI-NEXT: v_readlane_b32 s43, v62, 47 ; SI-NEXT: v_readlane_b32 s27, v61, 7 ; SI-NEXT: v_readlane_b32 s21, v61, 13 ; SI-NEXT: v_readlane_b32 s17, v61, 19 +; SI-NEXT: v_readlane_b32 s11, v61, 25 ; SI-NEXT: v_readlane_b32 s99, v63, 35 ; SI-NEXT: v_readlane_b32 s98, v63, 34 ; SI-NEXT: v_readlane_b32 s97, v63, 33 @@ -166057,11 +166060,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: 
v_readlane_b32 s8, v61, 20 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: s_and_b32 s5, s10, 0xff -; SI-NEXT: v_readlane_b32 s9, v61, 21 ; SI-NEXT: s_lshl_b32 s8, s8, 8 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: v_readlane_b32 s8, v61, 22 -; SI-NEXT: v_readlane_b32 s9, v61, 23 ; SI-NEXT: s_and_b32 s8, s8, 0xff ; SI-NEXT: v_readlane_b32 s10, v61, 24 ; SI-NEXT: s_lshl_b32 s8, s8, 16 @@ -166085,16 +166086,16 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 -; SI-NEXT: v_readlane_b32 s11, v61, 25 +; SI-NEXT: v_readlane_b32 s9, v61, 29 +; SI-NEXT: v_readlane_b32 s9, v61, 31 +; SI-NEXT: v_readlane_b32 s9, v61, 33 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; SI-NEXT: v_or_b32_e32 v1, s8, v1 ; SI-NEXT: v_readlane_b32 s8, v61, 28 -; SI-NEXT: v_readlane_b32 s9, v61, 29 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 ; SI-NEXT: s_lshl_b32 s5, s8, 8 ; SI-NEXT: v_readlane_b32 s8, v61, 30 -; SI-NEXT: v_readlane_b32 s9, v61, 31 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s8, 0xff ; SI-NEXT: v_readlane_b32 s8, v61, 32 @@ -166116,7 +166117,6 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: v_readlane_b32 s9, v61, 33 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; SI-NEXT: v_or_b32_e32 v1, s5, v1 @@ -167707,7 +167707,6 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: s_and_b32 s5, s6, 0xff ; VI-NEXT: v_readlane_b32 s6, v22, 49 -; VI-NEXT: v_readlane_b32 s9, v22, 5 ; VI-NEXT: s_lshl_b32 s6, s6, 8 ; VI-NEXT: s_or_b32 s5, s5, s6 ; VI-NEXT: v_readlane_b32 s6, v22, 48 @@ -167764,6 +167763,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> 
inreg %a ; VI-NEXT: s_lshl_b32 s5, s5, 16 ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x78, v0 ; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_readlane_b32 s9, v22, 5 ; VI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 ; VI-NEXT: v_mov_b32_e32 v1, s4 @@ -167814,42 +167814,41 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: .LBB91_4: ; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; kill: killed $sgpr46 -; VI-NEXT: ; implicit-def: $sgpr61 ; VI-NEXT: ; implicit-def: $sgpr60 -; VI-NEXT: ; implicit-def: $sgpr63 -; VI-NEXT: ; implicit-def: $sgpr62 -; VI-NEXT: ; implicit-def: $sgpr73 -; VI-NEXT: ; implicit-def: $sgpr72 -; VI-NEXT: ; implicit-def: $sgpr75 -; VI-NEXT: ; implicit-def: $sgpr74 -; VI-NEXT: ; implicit-def: $sgpr77 -; VI-NEXT: ; implicit-def: $sgpr76 -; VI-NEXT: ; implicit-def: $sgpr46 -; VI-NEXT: ; kill: killed $sgpr46 ; VI-NEXT: v_writelane_b32 v22, s60, 0 ; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr61 ; VI-NEXT: v_writelane_b32 v22, s61, 1 ; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr62 ; VI-NEXT: v_writelane_b32 v22, s62, 2 ; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr63 ; VI-NEXT: v_writelane_b32 v22, s63, 3 ; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr72 ; VI-NEXT: v_writelane_b32 v22, s72, 4 ; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr73 ; VI-NEXT: v_writelane_b32 v22, s73, 5 ; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr74 ; VI-NEXT: v_writelane_b32 v22, s74, 6 ; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr75 ; VI-NEXT: v_writelane_b32 v22, s75, 7 ; VI-NEXT: ; implicit-def: $sgpr46 ; 
VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr46 +; VI-NEXT: ; kill: killed $sgpr46 +; VI-NEXT: ; implicit-def: $sgpr76 ; VI-NEXT: v_writelane_b32 v22, s76, 8 ; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: ; kill: killed $sgpr46 @@ -167863,6 +167862,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: ; implicit-def: $sgpr47 ; VI-NEXT: ; implicit-def: $sgpr65 ; VI-NEXT: ; implicit-def: $sgpr53 +; VI-NEXT: ; implicit-def: $sgpr77 ; VI-NEXT: ; implicit-def: $sgpr59 ; VI-NEXT: ; implicit-def: $sgpr66 ; VI-NEXT: ; implicit-def: $sgpr78 @@ -169890,11 +169890,11 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: .LBB91_2: ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr43 ; GFX11-NEXT: ; implicit-def: $sgpr46 ; GFX11-NEXT: ; kill: killed $sgpr46 -; GFX11-NEXT: ; implicit-def: $vcc_hi +; GFX11-NEXT: ; implicit-def: $sgpr43 ; GFX11-NEXT: ; implicit-def: $vcc_lo +; GFX11-NEXT: ; implicit-def: $vcc_hi ; GFX11-NEXT: s_mov_b32 s104, -1 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 @@ -169997,7 +169997,6 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: v_writelane_b32 v20, s42, 0 -; GFX11-NEXT: v_writelane_b32 v20, s43, 1 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 @@ -170007,14 +170006,15 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: v_writelane_b32 v20, s46, 2 +; GFX11-NEXT: v_writelane_b32 v20, s43, 1 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 -; GFX11-NEXT: v_writelane_b32 v20, s47, 3 +; 
GFX11-NEXT: v_writelane_b32 v20, s46, 2 ; GFX11-NEXT: ; implicit-def: $sgpr46 +; GFX11-NEXT: v_writelane_b32 v20, s47, 3 ; GFX11-NEXT: v_writelane_b32 v20, vcc_lo, 4 -; GFX11-NEXT: v_writelane_b32 v20, vcc_hi, 5 ; GFX11-NEXT: ; implicit-def: $vcc_lo +; GFX11-NEXT: v_writelane_b32 v20, vcc_hi, 5 ; GFX11-NEXT: v_writelane_b32 v20, vcc_lo, 6 ; GFX11-NEXT: v_writelane_b32 v20, vcc_hi, 7 ; GFX11-NEXT: .LBB91_3: ; %Flow @@ -170854,82 +170854,82 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: s_lshr_b64 s[74:75], s[74:75], 24 ; GFX11-NEXT: s_lshr_b32 s75, s42, 8 ; GFX11-NEXT: v_writelane_b32 v20, s58, 0 +; GFX11-NEXT: s_lshr_b32 s58, s63, 24 ; GFX11-NEXT: s_lshr_b32 s26, s26, 16 ; GFX11-NEXT: s_lshr_b32 s65, s73, 24 ; GFX11-NEXT: s_pack_ll_b32_b16 s90, s26, s90 -; GFX11-NEXT: s_lshr_b32 s82, s73, 8 ; GFX11-NEXT: v_writelane_b32 v20, s59, 1 -; GFX11-NEXT: s_lshr_b32 s58, s63, 24 ; GFX11-NEXT: s_lshr_b32 s59, s63, 8 ; GFX11-NEXT: s_lshr_b64 s[62:63], s[62:63], 24 ; GFX11-NEXT: s_lshr_b32 s63, s93, 24 -; GFX11-NEXT: s_lshr_b32 s84, s72, 16 +; GFX11-NEXT: s_lshr_b32 s82, s73, 8 ; GFX11-NEXT: v_writelane_b32 v20, s63, 21 ; GFX11-NEXT: s_lshr_b32 s63, s93, 8 +; GFX11-NEXT: s_lshr_b32 s84, s72, 16 ; GFX11-NEXT: s_lshr_b32 s51, s72, 8 ; GFX11-NEXT: s_lshr_b64 s[72:73], s[72:73], 24 -; GFX11-NEXT: s_lshr_b32 s86, s77, 24 ; GFX11-NEXT: v_writelane_b32 v20, s63, 22 ; GFX11-NEXT: s_lshr_b32 s63, s92, 16 +; GFX11-NEXT: s_lshr_b32 s86, s77, 24 ; GFX11-NEXT: s_lshr_b32 s87, s77, 8 ; GFX11-NEXT: s_lshr_b32 s52, s76, 16 -; GFX11-NEXT: s_lshr_b32 s100, s76, 8 ; GFX11-NEXT: v_writelane_b32 v20, s63, 23 ; GFX11-NEXT: s_lshr_b32 s63, s92, 8 +; GFX11-NEXT: s_lshr_b32 s100, s76, 8 ; GFX11-NEXT: s_lshr_b64 s[76:77], s[76:77], 24 ; GFX11-NEXT: s_lshr_b32 s101, s89, 8 -; GFX11-NEXT: s_lshr_b32 s98, s79, 24 ; GFX11-NEXT: v_writelane_b32 v20, s63, 24 ; GFX11-NEXT: s_lshr_b32 s63, s95, 24 +; GFX11-NEXT: s_lshr_b32 s98, s79, 24 ; GFX11-NEXT: s_lshr_b32 
s99, s79, 8 ; GFX11-NEXT: s_lshr_b32 s53, s78, 16 -; GFX11-NEXT: s_lshr_b32 s97, s78, 8 ; GFX11-NEXT: v_writelane_b32 v20, s63, 25 ; GFX11-NEXT: s_lshr_b32 s63, s95, 8 +; GFX11-NEXT: s_lshr_b32 s97, s78, 8 ; GFX11-NEXT: s_lshr_b64 s[78:79], s[78:79], 24 ; GFX11-NEXT: s_lshr_b64 s[92:93], s[92:93], 24 -; GFX11-NEXT: s_lshr_b32 s102, s94, 16 ; GFX11-NEXT: v_writelane_b32 v20, s63, 26 ; GFX11-NEXT: s_lshr_b32 s63, s43, 24 +; GFX11-NEXT: s_lshr_b32 s102, s94, 16 ; GFX11-NEXT: s_lshr_b32 s103, s94, 8 ; GFX11-NEXT: s_lshr_b64 s[94:95], s[94:95], 24 -; GFX11-NEXT: s_lshr_b32 s73, s91, 24 ; GFX11-NEXT: v_writelane_b32 v20, s63, 27 ; GFX11-NEXT: s_lshr_b32 s63, s43, 8 +; GFX11-NEXT: s_lshr_b32 s73, s91, 24 ; GFX11-NEXT: s_lshr_b32 s77, s91, 8 ; GFX11-NEXT: s_lshr_b32 s83, s90, 8 -; GFX11-NEXT: s_lshr_b32 s66, s37, 24 ; GFX11-NEXT: v_writelane_b32 v20, s63, 28 ; GFX11-NEXT: s_lshr_b32 s63, s42, 16 ; GFX11-NEXT: s_lshr_b64 s[42:43], s[42:43], 24 +; GFX11-NEXT: s_lshr_b32 s66, s37, 24 ; GFX11-NEXT: s_lshr_b32 s67, s37, 8 -; GFX11-NEXT: s_lshr_b32 s68, s36, 16 ; GFX11-NEXT: v_writelane_b32 v20, s42, 6 +; GFX11-NEXT: s_lshr_b32 s68, s36, 16 ; GFX11-NEXT: s_lshr_b32 s49, s36, 8 ; GFX11-NEXT: s_lshr_b32 s69, s35, 24 ; GFX11-NEXT: s_lshr_b32 s70, s35, 8 -; GFX11-NEXT: s_lshr_b32 s64, s34, 16 ; GFX11-NEXT: v_writelane_b32 v20, s43, 7 ; GFX11-NEXT: s_lshr_b64 s[42:43], s[44:45], 24 +; GFX11-NEXT: s_lshr_b32 s64, s34, 16 ; GFX11-NEXT: s_lshr_b32 s80, s34, 8 ; GFX11-NEXT: s_lshr_b32 s79, s45, 24 -; GFX11-NEXT: s_lshr_b32 s93, s45, 8 ; GFX11-NEXT: v_writelane_b32 v20, s42, 4 +; GFX11-NEXT: s_lshr_b32 s42, s89, 24 +; GFX11-NEXT: s_lshr_b32 s93, s45, 8 ; GFX11-NEXT: s_lshr_b32 s95, s44, 16 ; GFX11-NEXT: s_lshr_b32 vcc_hi, s44, 8 -; GFX11-NEXT: s_lshr_b32 s34, s47, 24 -; GFX11-NEXT: s_lshr_b32 s55, s47, 8 ; GFX11-NEXT: v_writelane_b32 v20, s43, 5 ; GFX11-NEXT: s_lshr_b32 s43, s88, 16 -; GFX11-NEXT: s_lshr_b32 s42, s89, 24 +; GFX11-NEXT: s_lshr_b32 s34, s47, 24 +; GFX11-NEXT: s_lshr_b32 
s55, s47, 8 ; GFX11-NEXT: s_lshr_b32 s35, s46, 16 -; GFX11-NEXT: s_lshr_b32 s36, s46, 8 ; GFX11-NEXT: v_writelane_b32 v20, s43, 29 ; GFX11-NEXT: s_lshr_b32 s43, s88, 8 ; GFX11-NEXT: s_lshr_b64 s[88:89], s[88:89], 24 ; GFX11-NEXT: s_lshr_b32 s89, s90, 16 ; GFX11-NEXT: s_lshr_b64 s[90:91], s[90:91], 24 ; GFX11-NEXT: v_writelane_b32 v20, s43, 30 +; GFX11-NEXT: s_lshr_b32 s36, s46, 8 ; GFX11-NEXT: s_lshr_b64 s[46:47], s[46:47], 24 ; GFX11-NEXT: s_lshr_b32 s37, s57, 24 ; GFX11-NEXT: s_lshr_b32 s38, s57, 8 @@ -171042,9 +171042,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: s_and_b32 s16, s16, 0xff ; GFX11-NEXT: s_lshl_b32 s19, s73, 8 ; GFX11-NEXT: s_or_b32 s16, s16, s17 -; GFX11-NEXT: v_readlane_b32 s96, v19, 0 +; GFX11-NEXT: v_readlane_b32 s17, v20, 1 ; GFX11-NEXT: s_lshl_b32 s16, s16, 16 -; GFX11-NEXT: v_readlane_b32 s81, v18, 25 +; GFX11-NEXT: s_lshl_b32 s17, s70, 8 ; GFX11-NEXT: s_or_b32 s3, s3, s16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v7, s2 :: v_dual_mov_b32 v8, s3 @@ -171056,14 +171056,14 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: v_readlane_b32 s16, v20, 0 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_and_b32 s1, s21, 0xff -; GFX11-NEXT: v_readlane_b32 s17, v20, 1 -; GFX11-NEXT: s_or_b32 s1, s1, s2 -; GFX11-NEXT: v_readlane_b32 s2, v20, 18 -; GFX11-NEXT: s_and_b32 s1, s1, 0xffff -; GFX11-NEXT: s_lshl_b32 s17, s70, 8 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-NEXT: s_or_b32 s1, s1, s2 +; GFX11-NEXT: v_readlane_b32 s2, v20, 18 +; GFX11-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-NEXT: v_readlane_b32 s96, v19, 0 +; GFX11-NEXT: v_readlane_b32 s81, v18, 25 ; GFX11-NEXT: v_readlane_b32 s70, v18, 22 ; GFX11-NEXT: s_and_b32 s2, s2, 0xff ; GFX11-NEXT: v_readlane_b32 s69, v18, 21 @@ -171092,22 +171092,21 @@ define inreg <128 x 
i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: s_or_b32 s17, s17, s18 ; GFX11-NEXT: s_and_b32 s0, s0, 0xffff ; GFX11-NEXT: s_lshl_b32 s16, s17, 16 -; GFX11-NEXT: s_and_b32 s18, s71, 0xff +; GFX11-NEXT: v_readlane_b32 s17, v20, 3 ; GFX11-NEXT: s_or_b32 s3, s3, s16 ; GFX11-NEXT: v_readlane_b32 s16, v20, 2 ; GFX11-NEXT: v_dual_mov_b32 v11, s2 :: v_dual_mov_b32 v12, s3 ; GFX11-NEXT: s_and_b32 s2, s68, 0xff -; GFX11-NEXT: v_readlane_b32 s17, v20, 3 +; GFX11-NEXT: s_lshl_b32 s17, s66, 8 ; GFX11-NEXT: s_lshl_b32 s3, s16, 8 ; GFX11-NEXT: v_readlane_b32 s16, v20, 20 ; GFX11-NEXT: s_or_b32 s1, s2, s3 ; GFX11-NEXT: s_and_b32 s2, s25, 0xff ; GFX11-NEXT: s_lshl_b32 s3, s67, 8 -; GFX11-NEXT: s_lshl_b32 s17, s66, 8 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 ; GFX11-NEXT: s_and_b32 s16, s16, 0xff ; GFX11-NEXT: s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_or_b32 s3, s16, s17 -; GFX11-NEXT: s_lshl_b32 s1, s1, 16 ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 ; GFX11-NEXT: s_or_b32 s0, s0, s1 @@ -171120,9 +171119,10 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-NEXT: s_or_b32 s3, s16, s17 ; GFX11-NEXT: s_and_b32 s16, s27, 0xff ; GFX11-NEXT: s_lshl_b32 s17, s77, 8 -; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_and_b32 s18, s71, 0xff ; GFX11-NEXT: s_or_b32 s16, s16, s17 ; GFX11-NEXT: s_or_b32 s17, s18, s19 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-NEXT: s_lshl_b32 s3, s3, 16 ; GFX11-NEXT: s_and_b32 s16, s16, 0xffff ; GFX11-NEXT: s_lshl_b32 s17, s17, 16 @@ -188515,8 +188515,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_writelane_b32 v62, s5, 3 ; SI-NEXT: s_lshr_b64 s[4:5], s[44:45], 8 ; SI-NEXT: v_writelane_b32 v62, s4, 0 -; SI-NEXT: v_writelane_b32 v62, s5, 1 ; SI-NEXT: v_readfirstlane_b32 s4, v38 +; SI-NEXT: v_writelane_b32 v62, s5, 1 ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: v_readfirstlane_b32 s5, v37 ; SI-NEXT: s_or_b32 s42, 
s5, s4 @@ -188550,8 +188550,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_writelane_b32 v62, s5, 15 ; SI-NEXT: s_lshr_b64 s[4:5], s[40:41], 8 ; SI-NEXT: v_writelane_b32 v62, s4, 12 -; SI-NEXT: v_writelane_b32 v62, s5, 13 ; SI-NEXT: v_readfirstlane_b32 s4, v35 +; SI-NEXT: v_writelane_b32 v62, s5, 13 ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: v_readfirstlane_b32 s5, v20 ; SI-NEXT: s_or_b32 s28, s5, s4 @@ -188631,8 +188631,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_writelane_b32 v62, s5, 33 ; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 8 ; SI-NEXT: v_writelane_b32 v62, s4, 30 -; SI-NEXT: v_writelane_b32 v62, s5, 31 ; SI-NEXT: v_readfirstlane_b32 s4, v7 +; SI-NEXT: v_writelane_b32 v62, s5, 31 ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: v_readfirstlane_b32 s5, v29 ; SI-NEXT: s_or_b32 s22, s5, s4 @@ -188648,8 +188648,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_writelane_b32 v62, s5, 39 ; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 8 ; SI-NEXT: v_writelane_b32 v62, s4, 36 -; SI-NEXT: v_writelane_b32 v62, s5, 37 ; SI-NEXT: v_readfirstlane_b32 s4, v58 +; SI-NEXT: v_writelane_b32 v62, s5, 37 ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: v_readfirstlane_b32 s5, v13 ; SI-NEXT: s_or_b32 s20, s5, s4 @@ -188814,97 +188814,104 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr80 -; SI-NEXT: v_mov_b32_e32 v51, v42 ; SI-NEXT: v_writelane_b32 v62, s4, 0 ; SI-NEXT: v_writelane_b32 v62, s5, 1 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v41, v21 ; SI-NEXT: v_writelane_b32 v62, s4, 2 ; SI-NEXT: v_writelane_b32 v62, s5, 3 ; SI-NEXT: ; implicit-def: $sgpr4 -; 
SI-NEXT: v_mov_b32_e32 v21, v24 ; SI-NEXT: v_writelane_b32 v62, s4, 4 ; SI-NEXT: v_writelane_b32 v62, s5, 5 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v40, v34 ; SI-NEXT: v_writelane_b32 v62, s4, 6 ; SI-NEXT: v_writelane_b32 v62, s5, 7 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v34, v61 ; SI-NEXT: v_writelane_b32 v62, s4, 8 ; SI-NEXT: v_writelane_b32 v62, s5, 9 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v13, v12 ; SI-NEXT: v_writelane_b32 v62, s4, 10 ; SI-NEXT: v_writelane_b32 v62, s5, 11 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v12, v48 ; SI-NEXT: v_writelane_b32 v62, s4, 12 ; SI-NEXT: v_writelane_b32 v62, s5, 13 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v57, v30 ; SI-NEXT: v_writelane_b32 v62, s4, 14 ; SI-NEXT: v_writelane_b32 v62, s5, 15 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v58, v11 ; SI-NEXT: v_writelane_b32 v62, s4, 16 ; SI-NEXT: v_writelane_b32 v62, s5, 17 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v56, v47 ; SI-NEXT: v_writelane_b32 v62, s4, 18 ; SI-NEXT: v_writelane_b32 v62, s5, 19 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v46, v33 ; SI-NEXT: v_writelane_b32 v62, s4, 20 ; SI-NEXT: v_writelane_b32 v62, s5, 21 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v30, v32 ; SI-NEXT: v_writelane_b32 v62, s4, 22 ; SI-NEXT: v_writelane_b32 v62, s5, 23 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v32, v31 ; SI-NEXT: v_writelane_b32 v62, s4, 24 ; SI-NEXT: v_writelane_b32 v62, s5, 25 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v31, v10 ; SI-NEXT: v_writelane_b32 v62, s4, 26 ; SI-NEXT: v_writelane_b32 v62, s5, 27 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v54, v9 ; SI-NEXT: v_writelane_b32 v62, s4, 28 ; SI-NEXT: v_writelane_b32 v62, s5, 29 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v55, v4 
; SI-NEXT: v_writelane_b32 v62, s4, 30 ; SI-NEXT: v_writelane_b32 v62, s5, 31 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v22, v2 ; SI-NEXT: v_writelane_b32 v62, s4, 32 ; SI-NEXT: v_writelane_b32 v62, s5, 33 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v2, v1 ; SI-NEXT: v_writelane_b32 v62, s4, 34 ; SI-NEXT: v_writelane_b32 v62, s5, 35 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v17, v43 ; SI-NEXT: v_writelane_b32 v62, s4, 36 ; SI-NEXT: v_writelane_b32 v62, s5, 37 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: s_mov_b64 vcc, -1 ; SI-NEXT: v_writelane_b32 v62, s4, 38 ; SI-NEXT: v_writelane_b32 v62, s5, 39 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v25, v59 ; SI-NEXT: v_writelane_b32 v62, s4, 40 ; SI-NEXT: v_writelane_b32 v62, s5, 41 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_mov_b32_e32 v1, v52 ; SI-NEXT: v_writelane_b32 v62, s4, 42 ; SI-NEXT: v_writelane_b32 v62, s5, 43 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v62, s4, 44 +; SI-NEXT: v_writelane_b32 v62, s5, 45 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: v_writelane_b32 v62, s80, 46 +; SI-NEXT: v_writelane_b32 v62, s81, 47 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: v_writelane_b32 v62, s80, 48 +; SI-NEXT: v_mov_b32_e32 v51, v42 +; SI-NEXT: v_mov_b32_e32 v41, v21 +; SI-NEXT: v_mov_b32_e32 v21, v24 +; SI-NEXT: v_mov_b32_e32 v40, v34 +; SI-NEXT: v_mov_b32_e32 v34, v61 +; SI-NEXT: v_mov_b32_e32 v13, v12 +; SI-NEXT: v_mov_b32_e32 v12, v48 +; SI-NEXT: v_mov_b32_e32 v57, v30 +; SI-NEXT: v_mov_b32_e32 v58, v11 +; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_mov_b32_e32 v46, v33 +; SI-NEXT: v_mov_b32_e32 v30, v32 +; SI-NEXT: v_mov_b32_e32 v32, v31 +; SI-NEXT: v_mov_b32_e32 v31, v10 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v54, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v55, v4 +; SI-NEXT: v_mov_b32_e32 v22, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_mov_b32_e32 v17, v43 +; SI-NEXT: 
s_mov_b64 vcc, -1 +; SI-NEXT: v_writelane_b32 v62, s81, 49 +; SI-NEXT: v_mov_b32_e32 v25, v59 +; SI-NEXT: v_mov_b32_e32 v1, v52 ; SI-NEXT: ; implicit-def: $sgpr44 ; SI-NEXT: ; implicit-def: $sgpr71 ; SI-NEXT: ; implicit-def: $vgpr24 @@ -188974,14 +188981,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: ; implicit-def: $sgpr86 ; SI-NEXT: ; implicit-def: $sgpr84 ; SI-NEXT: ; implicit-def: $sgpr82 -; SI-NEXT: v_writelane_b32 v62, s4, 44 -; SI-NEXT: v_writelane_b32 v62, s5, 45 -; SI-NEXT: v_writelane_b32 v62, s80, 46 -; SI-NEXT: v_writelane_b32 v62, s81, 47 -; SI-NEXT: ; implicit-def: $sgpr80 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v62, s80, 48 -; SI-NEXT: v_writelane_b32 v62, s81, 49 ; SI-NEXT: ; implicit-def: $sgpr80 ; SI-NEXT: .LBB95_3: ; %Flow ; SI-NEXT: v_mov_b32_e32 v14, v17 @@ -189515,11 +189515,9 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_lshr_b32 s71, s45, 8 ; SI-NEXT: .LBB95_5: ; %end ; SI-NEXT: v_readlane_b32 vcc_lo, v62, 0 -; SI-NEXT: v_readlane_b32 vcc_hi, v62, 1 ; SI-NEXT: s_lshl_b32 s47, vcc_lo, 8 -; SI-NEXT: v_readlane_b32 vcc_lo, v62, 2 ; SI-NEXT: s_and_b32 s44, s44, 0xff -; SI-NEXT: v_readlane_b32 vcc_hi, v62, 3 +; SI-NEXT: v_readlane_b32 vcc_lo, v62, 2 ; SI-NEXT: s_or_b32 s44, s44, s47 ; SI-NEXT: s_and_b32 s47, vcc_lo, 0xff ; SI-NEXT: v_readlane_b32 vcc_lo, v62, 4 @@ -189540,10 +189538,9 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v13, v21, v13 ; SI-NEXT: v_or_b32_e32 v13, s44, v13 ; SI-NEXT: v_readlane_b32 s44, v62, 6 -; SI-NEXT: v_readlane_b32 s45, v62, 7 ; SI-NEXT: s_lshl_b32 s44, s44, 8 ; SI-NEXT: s_and_b32 s42, s42, 0xff -; SI-NEXT: v_readlane_b32 vcc_hi, v62, 5 +; SI-NEXT: v_readlane_b32 s45, v62, 7 ; SI-NEXT: s_or_b32 s42, s42, s44 ; SI-NEXT: v_readlane_b32 s44, v62, 8 ; SI-NEXT: v_readlane_b32 s45, v62, 9 @@ -189565,9 +189562,9 @@ define inreg <128 x i8> 
@bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v12, v23, v12 ; SI-NEXT: v_or_b32_e32 v12, s42, v12 ; SI-NEXT: v_readlane_b32 s42, v62, 12 -; SI-NEXT: v_readlane_b32 s43, v62, 13 ; SI-NEXT: s_lshl_b32 s42, s42, 8 ; SI-NEXT: s_and_b32 s40, s40, 0xff +; SI-NEXT: v_readlane_b32 s43, v62, 13 ; SI-NEXT: s_or_b32 s40, s40, s42 ; SI-NEXT: v_readlane_b32 s42, v62, 14 ; SI-NEXT: v_readlane_b32 s43, v62, 15 @@ -189589,9 +189586,9 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v11, v24, v11 ; SI-NEXT: v_or_b32_e32 v11, s40, v11 ; SI-NEXT: v_readlane_b32 s40, v62, 18 -; SI-NEXT: v_readlane_b32 s41, v62, 19 ; SI-NEXT: s_lshl_b32 s40, s40, 8 ; SI-NEXT: s_and_b32 s28, s28, 0xff +; SI-NEXT: v_readlane_b32 s41, v62, 19 ; SI-NEXT: s_or_b32 s28, s28, s40 ; SI-NEXT: v_readlane_b32 s40, v62, 20 ; SI-NEXT: v_readlane_b32 s41, v62, 21 @@ -189613,9 +189610,9 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v25, v26, v25 ; SI-NEXT: v_or_b32_e32 v25, s28, v25 ; SI-NEXT: v_readlane_b32 s28, v62, 24 -; SI-NEXT: v_readlane_b32 s29, v62, 25 ; SI-NEXT: s_lshl_b32 s28, s28, 8 ; SI-NEXT: s_and_b32 s26, s26, 0xff +; SI-NEXT: v_readlane_b32 s29, v62, 25 ; SI-NEXT: s_or_b32 s26, s26, s28 ; SI-NEXT: v_readlane_b32 s28, v62, 26 ; SI-NEXT: v_readlane_b32 s29, v62, 27 @@ -189637,14 +189634,17 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v27, v28, v27 ; SI-NEXT: v_or_b32_e32 v27, s26, v27 ; SI-NEXT: v_readlane_b32 s26, v62, 30 -; SI-NEXT: v_readlane_b32 s27, v62, 31 ; SI-NEXT: s_lshl_b32 s26, s26, 8 ; SI-NEXT: s_and_b32 s24, s24, 0xff +; SI-NEXT: v_readlane_b32 vcc_hi, v62, 1 +; SI-NEXT: v_readlane_b32 s27, v62, 31 ; SI-NEXT: s_or_b32 s24, s24, s26 ; SI-NEXT: v_readlane_b32 s26, v62, 32 +; SI-NEXT: v_readlane_b32 vcc_hi, v62, 3 ; SI-NEXT: v_readlane_b32 s27, v62, 33 ; SI-NEXT: s_and_b32 s26, 
s26, 0xff ; SI-NEXT: v_readlane_b32 s28, v62, 34 +; SI-NEXT: v_readlane_b32 vcc_hi, v62, 5 ; SI-NEXT: s_lshl_b32 s27, s28, 24 ; SI-NEXT: s_lshl_b32 s26, s26, 16 ; SI-NEXT: v_readlane_b32 vcc_hi, v62, 11 @@ -189678,11 +189678,11 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v11, vcc, 24, v0 -; SI-NEXT: v_readlane_b32 s25, v62, 37 ; SI-NEXT: s_lshl_b32 s24, s24, 8 ; SI-NEXT: s_and_b32 s22, s22, 0xff ; SI-NEXT: buffer_store_dword v24, v11, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v11, vcc, 28, v0 +; SI-NEXT: v_readlane_b32 s25, v62, 37 ; SI-NEXT: s_or_b32 s22, s22, s24 ; SI-NEXT: v_readlane_b32 s24, v62, 38 ; SI-NEXT: buffer_store_dword v25, v11, s[0:3], 0 offen @@ -193194,7 +193194,6 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: ; implicit-def: $sgpr74 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: v_writelane_b32 v78, s42, 0 -; GFX11-NEXT: v_writelane_b32 v78, s43, 1 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 @@ -193205,8 +193204,10 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: v_writelane_b32 v78, s43, 1 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr43 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 @@ -193269,7 +193270,6 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr43 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: 
s_branch .LBB95_2 ; GFX11-NEXT: .LBB95_4: @@ -193352,9 +193352,10 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: v_readlane_b32 s0, v78, 18 ; GFX11-NEXT: v_dual_mov_b32 v67, s58 :: v_dual_mov_b32 v26, s88 ; GFX11-NEXT: v_dual_mov_b32 v81, s44 :: v_dual_mov_b32 v30, s90 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_mov_b32_e32 v42, s0 ; GFX11-NEXT: v_readlane_b32 s0, v78, 19 +; GFX11-NEXT: v_readlane_b32 s1, v78, 1 ; GFX11-NEXT: v_dual_mov_b32 v17, s74 :: v_dual_mov_b32 v34, s92 ; GFX11-NEXT: v_dual_mov_b32 v65, s94 :: v_dual_mov_b32 v68, s30 ; GFX11-NEXT: v_mov_b32_e32 v41, s0 @@ -193414,10 +193415,9 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: v_readlane_b32 s0, v77, 7 ; GFX11-NEXT: v_mov_b32_e32 v149, s0 ; GFX11-NEXT: v_readlane_b32 s0, v77, 8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v148, s0 ; GFX11-NEXT: v_readlane_b32 s0, v78, 0 -; GFX11-NEXT: v_readlane_b32 s1, v78, 1 ; GFX11-NEXT: v_mov_b32_e32 v82, s0 ; GFX11-NEXT: .LBB95_5: ; %end ; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v74 @@ -211589,7 +211589,6 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_writelane_b32 v41, s17, 23 ; SI-NEXT: s_lshr_b64 s[16:17], s[60:61], 16 ; SI-NEXT: v_writelane_b32 v41, s16, 20 -; SI-NEXT: v_writelane_b32 v41, s17, 21 ; SI-NEXT: s_lshr_b32 s16, s61, 24 ; SI-NEXT: v_writelane_b32 v43, s16, 18 ; SI-NEXT: s_lshr_b32 s16, s61, 16 @@ -211691,6 +211690,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_lshr_b32 s16, s5, 16 ; SI-NEXT: v_writelane_b32 v42, s16, 0 ; SI-NEXT: s_lshr_b32 
s16, s5, 8 +; SI-NEXT: v_writelane_b32 v41, s17, 21 ; SI-NEXT: v_writelane_b32 v42, s16, 1 ; SI-NEXT: s_lshr_b64 s[16:17], s[56:57], 24 ; SI-NEXT: v_writelane_b32 v41, s16, 28 @@ -211833,7 +211833,6 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_readlane_b32 s17, v41, 25 ; SI-NEXT: s_lshl_b32 s17, s16, 8 ; SI-NEXT: s_and_b32 s18, s56, 0xff -; SI-NEXT: v_readlane_b32 s21, v41, 23 ; SI-NEXT: s_or_b32 s17, s18, s17 ; SI-NEXT: v_readlane_b32 s18, v41, 26 ; SI-NEXT: v_readlane_b32 s19, v41, 27 @@ -211975,7 +211974,6 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v14, s17 ; SI-NEXT: s_lshl_b32 s17, s90, 8 ; SI-NEXT: s_and_b32 s18, s46, 0xff -; SI-NEXT: v_readlane_b32 s21, v41, 29 ; SI-NEXT: s_or_b32 s17, s18, s17 ; SI-NEXT: v_readlane_b32 s18, v41, 30 ; SI-NEXT: v_readlane_b32 s19, v41, 31 @@ -211994,82 +211992,78 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_and_b32 s18, s16, 0xff ; SI-NEXT: v_readlane_b32 s16, v43, 39 -; SI-NEXT: s_lshl_b32 s18, s18, 16 -; SI-NEXT: s_lshl_b32 s19, s16, 24 -; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: s_and_b32 s17, s17, 0xffff ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_lshl_b32 s19, s16, 24 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; SI-NEXT: v_mov_b32_e32 v16, s17 -; SI-NEXT: v_readlane_b32 s16, v41, 34 +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: s_and_b32 s17, s17, 0xffff ; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; SI-NEXT: v_readlane_b32 s17, v41, 35 -; SI-NEXT: v_readlane_b32 s18, v41, 36 +; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: buffer_store_dword v4, v1, 
s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 -; SI-NEXT: s_lshl_b32 s16, s16, 8 -; SI-NEXT: s_and_b32 s17, s42, 0xff -; SI-NEXT: v_readlane_b32 s19, v41, 37 +; SI-NEXT: v_mov_b32_e32 v16, s17 +; SI-NEXT: v_readlane_b32 s16, v41, 34 +; SI-NEXT: v_readlane_b32 s17, v41, 35 ; SI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 +; SI-NEXT: s_lshl_b32 s16, s16, 8 +; SI-NEXT: s_and_b32 s17, s42, 0xff +; SI-NEXT: v_readlane_b32 s18, v41, 36 +; SI-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_and_b32 s17, s18, 0xff ; SI-NEXT: v_readlane_b32 s18, v41, 38 -; SI-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 -; SI-NEXT: s_lshl_b32 s18, s18, 24 -; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 -; SI-NEXT: v_readlane_b32 s17, v43, 44 +; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 +; SI-NEXT: v_readlane_b32 s17, v43, 44 +; SI-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 ; SI-NEXT: v_mov_b32_e32 v2, s16 ; SI-NEXT: s_and_b32 s16, s43, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 -; SI-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 -; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: v_readlane_b32 s17, v43, 43 ; 
SI-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 -; SI-NEXT: s_and_b32 s17, s17, 0xff -; SI-NEXT: v_readlane_b32 s18, v43, 42 -; SI-NEXT: v_readlane_b32 s19, v41, 39 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s17, v43, 43 ; SI-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 -; SI-NEXT: s_lshl_b32 s17, s17, 16 -; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: s_and_b32 s17, s17, 0xff +; SI-NEXT: v_readlane_b32 s18, v43, 42 ; SI-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 -; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: v_readlane_b32 s18, v41, 40 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s18, 24 ; SI-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 ; SI-NEXT: s_and_b32 s16, s16, 0xffff -; SI-NEXT: v_readlane_b32 s19, v41, 41 +; SI-NEXT: s_or_b32 s17, s18, s17 ; SI-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 ; SI-NEXT: s_or_b32 s16, s16, s17 -; SI-NEXT: s_lshl_b32 s17, s18, 8 -; SI-NEXT: v_readlane_b32 s18, v41, 42 +; SI-NEXT: v_readlane_b32 s18, v41, 40 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s16 ; SI-NEXT: s_and_b32 s16, s40, 0xff -; SI-NEXT: v_readlane_b32 s19, v41, 43 +; SI-NEXT: s_lshl_b32 s17, s18, 8 +; SI-NEXT: v_readlane_b32 s18, v41, 42 ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: s_and_b32 s17, s18, 0xff ; SI-NEXT: v_readlane_b32 s18, v41, 44 @@ -212100,9 +212094,8 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s16 ; SI-NEXT: v_readlane_b32 s16, v41, 46 ; SI-NEXT: s_and_b32 s14, s14, 0xff -; SI-NEXT: v_readlane_b32 s17, v41, 47 ; SI-NEXT: s_lshl_b32 s16, s16, 8 -; SI-NEXT: v_readlane_b32 s19, v41, 45 +; SI-NEXT: v_readlane_b32 s17, v41, 47 ; SI-NEXT: s_or_b32 s14, 
s14, s16 ; SI-NEXT: v_readlane_b32 s16, v41, 48 ; SI-NEXT: v_readlane_b32 s17, v41, 49 @@ -212135,8 +212128,8 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s14 ; SI-NEXT: v_readlane_b32 s14, v41, 52 ; SI-NEXT: s_and_b32 s12, s12, 0xff -; SI-NEXT: v_readlane_b32 s15, v41, 53 ; SI-NEXT: s_lshl_b32 s14, s14, 8 +; SI-NEXT: v_readlane_b32 s15, v41, 53 ; SI-NEXT: s_or_b32 s12, s12, s14 ; SI-NEXT: v_readlane_b32 s14, v41, 54 ; SI-NEXT: v_readlane_b32 s15, v41, 55 @@ -212169,8 +212162,8 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s12 ; SI-NEXT: v_readlane_b32 s12, v41, 58 ; SI-NEXT: s_and_b32 s10, s10, 0xff -; SI-NEXT: v_readlane_b32 s13, v41, 59 ; SI-NEXT: s_lshl_b32 s12, s12, 8 +; SI-NEXT: v_readlane_b32 s13, v41, 59 ; SI-NEXT: s_or_b32 s10, s10, s12 ; SI-NEXT: v_readlane_b32 s12, v41, 60 ; SI-NEXT: v_readlane_b32 s13, v41, 61 @@ -212203,8 +212196,8 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s10 ; SI-NEXT: v_readlane_b32 s10, v43, 0 ; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: v_readlane_b32 s11, v43, 1 ; SI-NEXT: s_lshl_b32 s10, s10, 8 +; SI-NEXT: v_readlane_b32 s11, v43, 1 ; SI-NEXT: s_or_b32 s8, s8, s10 ; SI-NEXT: v_readlane_b32 s10, v43, 2 ; SI-NEXT: v_readlane_b32 s11, v43, 3 @@ -212237,8 +212230,8 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: v_readlane_b32 s8, v43, 6 ; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: v_readlane_b32 s9, v43, 7 ; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: v_readlane_b32 s9, v43, 7 ; SI-NEXT: s_or_b32 s6, s6, s8 ; SI-NEXT: v_readlane_b32 s8, v43, 8 ; SI-NEXT: v_readlane_b32 s9, v43, 9 @@ -212271,8 +212264,8 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_readlane_b32 s6, v43, 
12 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: v_readlane_b32 s7, v43, 13 ; SI-NEXT: s_lshl_b32 s6, s6, 8 +; SI-NEXT: v_readlane_b32 s7, v43, 13 ; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: v_readlane_b32 s6, v43, 14 ; SI-NEXT: v_readlane_b32 s7, v43, 15 @@ -212292,14 +212285,21 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_readlane_b32 s5, v42, 0 +; SI-NEXT: v_readlane_b32 s19, v41, 37 ; SI-NEXT: s_and_b32 s5, s5, 0xff ; SI-NEXT: v_readlane_b32 s6, v43, 63 +; SI-NEXT: v_readlane_b32 s19, v41, 39 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s6, s6, 24 +; SI-NEXT: v_readlane_b32 s19, v41, 41 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_readlane_b32 s21, v41, 23 +; SI-NEXT: v_readlane_b32 s19, v41, 43 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x78, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s21, v41, 29 +; SI-NEXT: v_readlane_b32 s19, v41, 45 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: v_mov_b32_e32 v1, s4 @@ -212357,47 +212357,14 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB99_4: ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr16 ; SI-NEXT: v_writelane_b32 v41, s4, 20 ; SI-NEXT: v_writelane_b32 v41, s5, 21 ; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr92 -; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $sgpr82 -; SI-NEXT: ; implicit-def: $sgpr80 -; SI-NEXT: ; implicit-def: $sgpr70 -; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr68 -; SI-NEXT: ; implicit-def: $sgpr66 -; SI-NEXT: ; implicit-def: $sgpr64 -; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; implicit-def: $sgpr52 
-; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr38 -; SI-NEXT: ; implicit-def: $sgpr36 -; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr34 -; SI-NEXT: ; implicit-def: $sgpr30 -; SI-NEXT: ; implicit-def: $sgpr94 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr90 -; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: v_writelane_b32 v41, s4, 22 -; SI-NEXT: v_writelane_b32 v41, s5, 23 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v41, s5, 23 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; kill: killed $sgpr4 @@ -212409,7 +212376,6 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_writelane_b32 v41, s5, 27 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: v_writelane_b32 v41, s4, 28 -; SI-NEXT: v_writelane_b32 v41, s5, 29 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 @@ -212497,6 +212463,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v41, s5, 29 ; SI-NEXT: ; kill: killed $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; kill: killed $sgpr4 @@ -212550,7 +212517,6 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_writelane_b32 v41, s5, 61 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: v_writelane_b32 v41, s4, 62 -; SI-NEXT: v_writelane_b32 v41, s5, 63 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: s_waitcnt expcnt(0) ; 
SI-NEXT: v_writelane_b32 v43, s4, 0 @@ -212570,14 +212536,48 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: v_writelane_b32 v43, s4, 10 ; SI-NEXT: v_writelane_b32 v43, s5, 11 +; SI-NEXT: ; implicit-def: $sgpr16 ; SI-NEXT: v_writelane_b32 v43, s16, 12 ; SI-NEXT: v_writelane_b32 v43, s17, 13 ; SI-NEXT: ; implicit-def: $sgpr16 -; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: v_writelane_b32 v43, s16, 14 ; SI-NEXT: v_writelane_b32 v43, s17, 15 ; SI-NEXT: ; implicit-def: $sgpr16 ; SI-NEXT: v_writelane_b32 v43, s16, 16 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr82 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: v_writelane_b32 v41, s5, 63 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: v_writelane_b32 v43, s17, 17 ; SI-NEXT: s_branch .LBB99_2 ; @@ -215550,7 +215550,6 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, 
i3 ; GFX11-NEXT: ; implicit-def: $sgpr74 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: v_writelane_b32 v78, s42, 0 -; GFX11-NEXT: v_writelane_b32 v78, s43, 1 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 @@ -215561,8 +215560,10 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: v_writelane_b32 v78, s43, 1 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 +; GFX11-NEXT: ; implicit-def: $sgpr43 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 @@ -215625,7 +215626,6 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: ; kill: killed $sgpr42 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: ; kill: killed $sgpr42 -; GFX11-NEXT: ; implicit-def: $sgpr43 ; GFX11-NEXT: ; implicit-def: $sgpr42 ; GFX11-NEXT: s_branch .LBB99_2 ; GFX11-NEXT: .LBB99_4: @@ -215708,9 +215708,10 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: v_readlane_b32 s0, v78, 18 ; GFX11-NEXT: v_dual_mov_b32 v67, s58 :: v_dual_mov_b32 v26, s88 ; GFX11-NEXT: v_dual_mov_b32 v81, s44 :: v_dual_mov_b32 v30, s90 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_mov_b32_e32 v42, s0 ; GFX11-NEXT: v_readlane_b32 s0, v78, 19 +; GFX11-NEXT: v_readlane_b32 s1, v78, 1 ; GFX11-NEXT: v_dual_mov_b32 v17, s74 :: v_dual_mov_b32 v34, s92 ; GFX11-NEXT: v_dual_mov_b32 v65, s94 :: v_dual_mov_b32 v68, s30 ; GFX11-NEXT: v_mov_b32_e32 v41, s0 @@ -215770,10 +215771,9 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: v_readlane_b32 s0, 
v77, 7 ; GFX11-NEXT: v_mov_b32_e32 v149, s0 ; GFX11-NEXT: v_readlane_b32 s0, v77, 8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v148, s0 ; GFX11-NEXT: v_readlane_b32 s0, v78, 0 -; GFX11-NEXT: v_readlane_b32 s1, v78, 1 ; GFX11-NEXT: v_mov_b32_e32 v82, s0 ; GFX11-NEXT: .LBB99_5: ; %end ; GFX11-NEXT: v_lshlrev_b32_e32 v69, 8, v74 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll index 3e2b488d02f37..9c05297f7bcae 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll @@ -66739,11 +66739,9 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; SI-NEXT: v_writelane_b32 v21, s17, 13 ; SI-NEXT: .LBB97_3: ; %end ; SI-NEXT: v_readlane_b32 s18, v21, 0 -; SI-NEXT: v_readlane_b32 s19, v21, 1 +; SI-NEXT: s_and_b32 s16, s40, 0xff ; SI-NEXT: s_lshl_b32 s17, s18, 8 ; SI-NEXT: v_readlane_b32 s18, v21, 2 -; SI-NEXT: s_and_b32 s16, s40, 0xff -; SI-NEXT: v_readlane_b32 s19, v21, 3 ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: s_and_b32 s17, s18, 0xff ; SI-NEXT: v_readlane_b32 s18, v21, 4 @@ -66765,9 +66763,8 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; SI-NEXT: v_mov_b32_e32 v2, s16 ; SI-NEXT: v_readlane_b32 s16, v21, 6 ; SI-NEXT: s_and_b32 s14, s14, 0xff -; SI-NEXT: v_readlane_b32 s17, v21, 7 ; SI-NEXT: s_lshl_b32 s16, s16, 8 -; SI-NEXT: v_readlane_b32 s19, v21, 5 +; SI-NEXT: v_readlane_b32 s17, v21, 7 ; SI-NEXT: s_or_b32 s14, s14, s16 ; SI-NEXT: v_readlane_b32 s16, v21, 8 ; SI-NEXT: v_readlane_b32 s17, v21, 9 @@ -66799,8 +66796,8 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; SI-NEXT: v_mov_b32_e32 v2, s14 ; SI-NEXT: v_readlane_b32 s14, v21, 12 ; SI-NEXT: s_and_b32 s10, s10, 0xff -; SI-NEXT: 
v_readlane_b32 s15, v21, 13 ; SI-NEXT: s_lshl_b32 s14, s14, 8 +; SI-NEXT: v_readlane_b32 s15, v21, 13 ; SI-NEXT: s_or_b32 s10, s10, s14 ; SI-NEXT: v_readlane_b32 s14, v21, 14 ; SI-NEXT: v_readlane_b32 s15, v21, 15 @@ -66951,10 +66948,13 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; SI-NEXT: s_and_b32 s5, s89, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s6, s91, 24 +; SI-NEXT: v_readlane_b32 s19, v21, 1 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_readlane_b32 s19, v21, 3 ; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s19, v21, 5 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 ; SI-NEXT: v_mov_b32_e32 v1, s4 @@ -67009,6 +67009,28 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; SI-NEXT: v_writelane_b32 v21, s4, 0 ; SI-NEXT: v_writelane_b32 v21, s5, 1 ; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v21, s4, 2 +; SI-NEXT: v_writelane_b32 v21, s5, 3 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v21, s4, 4 +; SI-NEXT: v_writelane_b32 v21, s5, 5 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v21, s4, 6 +; SI-NEXT: v_writelane_b32 v21, s5, 7 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v21, s4, 8 +; SI-NEXT: v_writelane_b32 v21, s5, 9 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v21, s4, 10 +; SI-NEXT: v_writelane_b32 v21, s5, 11 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v21, s4, 12 +; SI-NEXT: v_writelane_b32 v21, s5, 13 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v21, s4, 14 +; SI-NEXT: v_writelane_b32 v21, s5, 15 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v21, s4, 16 ; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; implicit-def: $sgpr74 @@ -67036,6 +67058,7 @@ define inreg 
<64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; SI-NEXT: ; implicit-def: $sgpr79 ; SI-NEXT: ; implicit-def: $sgpr89 ; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: v_writelane_b32 v21, s5, 17 ; SI-NEXT: ; implicit-def: $sgpr42 ; SI-NEXT: ; implicit-def: $sgpr66 ; SI-NEXT: ; implicit-def: $sgpr64 @@ -67052,33 +67075,10 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; SI-NEXT: ; implicit-def: $sgpr30 ; SI-NEXT: ; implicit-def: $sgpr94 ; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr90 ; SI-NEXT: ; implicit-def: $sgpr88 ; SI-NEXT: ; implicit-def: $sgpr78 -; SI-NEXT: v_writelane_b32 v21, s4, 2 -; SI-NEXT: v_writelane_b32 v21, s5, 3 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v21, s4, 4 -; SI-NEXT: v_writelane_b32 v21, s5, 5 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v21, s4, 6 -; SI-NEXT: v_writelane_b32 v21, s5, 7 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v21, s4, 8 -; SI-NEXT: v_writelane_b32 v21, s5, 9 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v21, s4, 10 -; SI-NEXT: v_writelane_b32 v21, s5, 11 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v21, s4, 12 -; SI-NEXT: v_writelane_b32 v21, s5, 13 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v21, s4, 14 -; SI-NEXT: v_writelane_b32 v21, s5, 15 -; SI-NEXT: ; implicit-def: $sgpr4 -; SI-NEXT: v_writelane_b32 v21, s4, 16 -; SI-NEXT: v_writelane_b32 v21, s5, 17 -; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: s_branch .LBB97_2 ; ; VI-LABEL: bitcast_v32i16_to_v64i8_scalar: @@ -88402,8 +88402,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: s_lshr_b64 s[4:5], s[74:75], 24 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v41, s4, 0 -; SI-NEXT: v_writelane_b32 v41, s5, 1 ; SI-NEXT: v_readfirstlane_b32 s4, v6 +; SI-NEXT: v_writelane_b32 v41, s5, 1 ; SI-NEXT: 
s_lshr_b32 s5, s4, 16 ; SI-NEXT: v_readfirstlane_b32 s4, v7 ; SI-NEXT: s_lshr_b64 s[60:61], s[4:5], 16 diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll index 37cbd2d926413..34abba10f6c61 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll @@ -150,8 +150,8 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) { ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, vcc_lo +; GCN-NEXT: v_readlane_b32 s0, v0, 0 ; GCN-NEXT: v_readlane_b32 s1, v0, 1 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir b/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir index 5f36d5403ebcf..744871d8c84ff 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir @@ -65,12 +65,12 @@ body: | ; CHECK: S_NOP 0, implicit-def $exec ; CHECK-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; CHECK-NEXT: $vgpr0 = IMPLICIT_DEF - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1 - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, $vgpr0, implicit $sgpr0_sgpr1 - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1 + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, $vgpr0 + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1 ; CHECK-NEXT: S_NOP 0, implicit-def dead renamable $sgpr2_sgpr3, implicit-def dead renamable $sgpr0_sgpr1, implicit killed renamable $sgpr0_sgpr1 - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1 + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1 ; CHECK-NEXT: $exec = S_MOV_B64 
killed $sgpr0_sgpr1 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec @@ -141,12 +141,12 @@ body: | ; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_exec ; CHECK: S_NOP 0, implicit-def renamable $sgpr0_sgpr1, implicit-def dead renamable $sgpr2_sgpr3, implicit-def $exec ; CHECK-NEXT: $vgpr0 = IMPLICIT_DEF - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1 - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, $vgpr0, implicit $sgpr0_sgpr1 - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1 + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, $vgpr0 + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1 ; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr0_sgpr1, implicit-def dead renamable $sgpr2_sgpr3, implicit-def dead renamable $sgpr0_sgpr1 - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1 + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1 ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll index 76f204dd0c16a..420f003d4f417 100644 --- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll +++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll @@ -41,10 +41,10 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: s_load_dwordx16 s[8:23], s[68:69], 0x130 ; CHECK-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane ; CHECK-NEXT: v_writelane_b32 v6, s70, 20 +; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: v_writelane_b32 v6, s71, 21 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v1, s4 -; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: v_writelane_b32 v7, 
s8, 0 ; CHECK-NEXT: v_writelane_b32 v7, s9, 1 ; CHECK-NEXT: v_writelane_b32 v7, s10, 2 @@ -76,15 +76,14 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: v_writelane_b32 v7, s64, 28 ; CHECK-NEXT: v_writelane_b32 v7, s65, 29 ; CHECK-NEXT: v_writelane_b32 v7, s66, 30 +; CHECK-NEXT: v_writelane_b32 v7, s67, 31 ; CHECK-NEXT: s_load_dwordx16 s[8:23], s[68:69], 0x1f0 ; CHECK-NEXT: s_load_dwordx16 s[36:51], s[68:69], 0x2f0 ; CHECK-NEXT: s_mov_b32 s69, s68 ; CHECK-NEXT: s_mov_b32 s70, s68 ; CHECK-NEXT: s_mov_b32 s71, s68 -; CHECK-NEXT: v_writelane_b32 v7, s67, 31 -; CHECK-NEXT: image_sample_lz v1, v[1:2], s[60:67], s[68:71] dmask:0x1 -; CHECK-NEXT: v_readlane_b32 s52, v7, 0 ; CHECK-NEXT: v_mov_b32_e32 v3, v2 +; CHECK-NEXT: v_readlane_b32 s52, v7, 0 ; CHECK-NEXT: v_readlane_b32 s53, v7, 1 ; CHECK-NEXT: v_readlane_b32 s54, v7, 2 ; CHECK-NEXT: v_readlane_b32 s55, v7, 3 @@ -92,12 +91,13 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: v_readlane_b32 s57, v7, 5 ; CHECK-NEXT: v_readlane_b32 s58, v7, 6 ; CHECK-NEXT: v_readlane_b32 s59, v7, 7 +; CHECK-NEXT: image_sample_lz v1, v[1:2], s[60:67], s[68:71] dmask:0x1 ; CHECK-NEXT: v_and_b32_e32 v5, 1, v0 ; CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v5 ; CHECK-NEXT: v_readlane_b32 s60, v7, 8 ; CHECK-NEXT: v_readlane_b32 s61, v7, 9 -; CHECK-NEXT: v_readlane_b32 s62, v7, 10 ; CHECK-NEXT: image_sample_lz v4, v[2:3], s[52:59], s[68:71] dmask:0x1 +; CHECK-NEXT: v_readlane_b32 s62, v7, 10 ; CHECK-NEXT: v_readlane_b32 s63, v7, 11 ; CHECK-NEXT: v_readlane_b32 s64, v7, 12 ; CHECK-NEXT: v_readlane_b32 s65, v7, 13 @@ -109,7 +109,6 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; CHECK-NEXT: s_cbranch_execz .LBB0_3 ; CHECK-NEXT: ; %bb.1: ; %bb48 -; CHECK-NEXT: v_readlane_b32 s52, v7, 16 ; CHECK-NEXT: v_readlane_b32 s60, v7, 24 ; CHECK-NEXT: v_readlane_b32 s61, v7, 25 ; CHECK-NEXT: v_readlane_b32 s62, v7, 26 @@ -120,10 +119,11 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: v_readlane_b32 s67, v7, 31 ; CHECK-NEXT: 
v_mov_b32_e32 v1, v2 ; CHECK-NEXT: s_and_b64 vcc, exec, -1 +; CHECK-NEXT: v_readlane_b32 s52, v7, 16 ; CHECK-NEXT: v_readlane_b32 s53, v7, 17 ; CHECK-NEXT: v_readlane_b32 s54, v7, 18 -; CHECK-NEXT: v_readlane_b32 s55, v7, 19 ; CHECK-NEXT: image_sample_lz v3, v[2:3], s[60:67], s[68:71] dmask:0x1 +; CHECK-NEXT: v_readlane_b32 s55, v7, 19 ; CHECK-NEXT: v_readlane_b32 s56, v7, 20 ; CHECK-NEXT: v_readlane_b32 s57, v7, 21 ; CHECK-NEXT: v_readlane_b32 s58, v7, 22 @@ -152,10 +152,18 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: s_mov_b32 s16, 0 ; CHECK-NEXT: s_mov_b32 s17, s16 ; CHECK-NEXT: v_mov_b32_e32 v0, s16 -; CHECK-NEXT: v_readlane_b32 s44, v7, 16 +; CHECK-NEXT: v_readlane_b32 s52, v7, 24 +; CHECK-NEXT: v_readlane_b32 s53, v7, 25 +; CHECK-NEXT: v_readlane_b32 s54, v7, 26 +; CHECK-NEXT: v_readlane_b32 s55, v7, 27 +; CHECK-NEXT: v_readlane_b32 s56, v7, 28 +; CHECK-NEXT: v_readlane_b32 s57, v7, 29 +; CHECK-NEXT: v_readlane_b32 s58, v7, 30 +; CHECK-NEXT: v_readlane_b32 s59, v7, 31 ; CHECK-NEXT: v_mov_b32_e32 v1, s17 ; CHECK-NEXT: s_mov_b32 s18, s16 ; CHECK-NEXT: s_mov_b32 s19, s16 +; CHECK-NEXT: v_readlane_b32 s44, v7, 16 ; CHECK-NEXT: v_readlane_b32 s45, v7, 17 ; CHECK-NEXT: v_readlane_b32 s46, v7, 18 ; CHECK-NEXT: v_readlane_b32 s47, v7, 19 @@ -163,16 +171,6 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: v_readlane_b32 s49, v7, 21 ; CHECK-NEXT: v_readlane_b32 s50, v7, 22 ; CHECK-NEXT: v_readlane_b32 s51, v7, 23 -; CHECK-NEXT: v_readlane_b32 s52, v7, 24 -; CHECK-NEXT: v_readlane_b32 s53, v7, 25 -; CHECK-NEXT: v_readlane_b32 s54, v7, 26 -; CHECK-NEXT: v_readlane_b32 s55, v7, 27 -; CHECK-NEXT: v_readlane_b32 s56, v7, 28 -; CHECK-NEXT: v_readlane_b32 s57, v7, 29 -; CHECK-NEXT: v_readlane_b32 s58, v7, 30 -; CHECK-NEXT: v_readlane_b32 s59, v7, 31 -; CHECK-NEXT: image_sample_lz v2, v[0:1], s[44:51], s[16:19] dmask:0x1 -; CHECK-NEXT: v_readlane_b32 s44, v7, 0 ; CHECK-NEXT: v_readlane_b32 s52, v7, 8 ; CHECK-NEXT: v_readlane_b32 s53, v7, 9 ; CHECK-NEXT: v_readlane_b32 s54, 
v7, 10 @@ -181,12 +179,14 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: v_readlane_b32 s57, v7, 13 ; CHECK-NEXT: v_readlane_b32 s58, v7, 14 ; CHECK-NEXT: v_readlane_b32 s59, v7, 15 +; CHECK-NEXT: image_sample_lz v2, v[0:1], s[44:51], s[16:19] dmask:0x1 ; CHECK-NEXT: v_mov_b32_e32 v3, 0 ; CHECK-NEXT: v_mov_b32_e32 v4, v3 +; CHECK-NEXT: v_readlane_b32 s44, v7, 0 ; CHECK-NEXT: v_readlane_b32 s45, v7, 1 +; CHECK-NEXT: image_sample_lz v0, v[0:1], s[52:59], s[24:27] dmask:0x1 ; CHECK-NEXT: v_readlane_b32 s46, v7, 2 ; CHECK-NEXT: v_readlane_b32 s47, v7, 3 -; CHECK-NEXT: image_sample_lz v0, v[0:1], s[52:59], s[24:27] dmask:0x1 ; CHECK-NEXT: v_readlane_b32 s48, v7, 4 ; CHECK-NEXT: v_readlane_b32 s49, v7, 5 ; CHECK-NEXT: v_readlane_b32 s50, v7, 6 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll index e1b4cad370f96..4a89b2fcc017c 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -1826,10 +1826,10 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, ; GCN-NEXT: s_or_b32 s0, s0, s5 ; GCN-NEXT: s_and_b32 s0, s0, 0xffff ; GCN-NEXT: s_or_b32 s0, s0, s4 -; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_readlane_b32 s0, v6, 0 ; GCN-NEXT: v_readlane_b32 s1, v6, 1 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_readlane_b32 s0, v6, 0 ; GCN-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s3 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll index a21a405164b6a..689b38846c61b 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll @@ -10334,11 +10334,11 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: v_writelane_b32 v62, s3, 5 ; GFX8-NEXT: v_readlane_b32 s2, v62, 2 ; GFX8-NEXT: v_readlane_b32 
s3, v62, 3 +; GFX8-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 ; GFX8-NEXT: v_mov_b32_e32 v35, s49 ; GFX8-NEXT: s_bfe_i64 s[48:49], s[4:5], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x10000 ; GFX8-NEXT: v_readlane_b32 s2, v62, 0 -; GFX8-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 ; GFX8-NEXT: v_readlane_b32 s3, v62, 1 ; GFX8-NEXT: v_mov_b32_e32 v5, s75 ; GFX8-NEXT: v_mov_b32_e32 v13, s73 @@ -10632,8 +10632,8 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NEXT: v_readlane_b32 s2, v62, 4 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_readlane_b32 s2, v62, 4 ; GFX8-NEXT: v_readlane_b32 s3, v62, 5 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v0, s30 diff --git a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll index 75638c5fa8476..58375b6f8a8a4 100644 --- a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll @@ -139,13 +139,13 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX906-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX906-NEXT: s_mov_b64 exec, s[34:35] -; GFX906-NEXT: v_readlane_b32 s16, v39, 22 ; GFX906-NEXT: s_mov_b32 s12, s24 ; GFX906-NEXT: s_mov_b32 s13, s23 ; GFX906-NEXT: s_mov_b32 s14, s22 ; GFX906-NEXT: v_mov_b32_e32 v31, v32 ; GFX906-NEXT: s_mov_b32 s15, s21 ; GFX906-NEXT: s_mov_b64 s[10:11], s[26:27] +; GFX906-NEXT: v_readlane_b32 s16, v39, 22 ; GFX906-NEXT: v_readlane_b32 s17, v39, 23 ; GFX906-NEXT: v_mov_b32_e32 v40, v32 ; GFX906-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -232,20 +232,20 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX906-NEXT: 
buffer_store_dword v39, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX906-NEXT: s_mov_b64 exec, s[34:35] -; GFX906-NEXT: v_readlane_b32 s4, v39, 10 -; GFX906-NEXT: v_readlane_b32 s6, v39, 8 -; GFX906-NEXT: v_readlane_b32 s8, v39, 6 -; GFX906-NEXT: v_readlane_b32 s10, v39, 4 -; GFX906-NEXT: v_readlane_b32 s16, v39, 22 ; GFX906-NEXT: v_readlane_b32 s12, v39, 3 ; GFX906-NEXT: v_mov_b32_e32 v31, v40 ; GFX906-NEXT: v_readlane_b32 s13, v39, 2 ; GFX906-NEXT: v_readlane_b32 s14, v39, 1 ; GFX906-NEXT: v_readlane_b32 s15, v39, 0 +; GFX906-NEXT: v_readlane_b32 s4, v39, 10 ; GFX906-NEXT: v_readlane_b32 s5, v39, 11 +; GFX906-NEXT: v_readlane_b32 s6, v39, 8 ; GFX906-NEXT: v_readlane_b32 s7, v39, 9 +; GFX906-NEXT: v_readlane_b32 s8, v39, 6 ; GFX906-NEXT: v_readlane_b32 s9, v39, 7 +; GFX906-NEXT: v_readlane_b32 s10, v39, 4 ; GFX906-NEXT: v_readlane_b32 s11, v39, 5 +; GFX906-NEXT: v_readlane_b32 s16, v39, 22 ; GFX906-NEXT: v_readlane_b32 s17, v39, 23 ; GFX906-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1 @@ -253,19 +253,19 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX906-NEXT: s_mov_b64 exec, s[34:35] ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_readlane_b32 s4, v39, 10 -; GFX906-NEXT: v_readlane_b32 s6, v39, 8 -; GFX906-NEXT: v_readlane_b32 s8, v39, 6 -; GFX906-NEXT: v_readlane_b32 s10, v39, 4 -; GFX906-NEXT: v_readlane_b32 s16, v39, 22 ; GFX906-NEXT: v_readlane_b32 s5, v39, 11 +; GFX906-NEXT: v_readlane_b32 s6, v39, 8 ; GFX906-NEXT: v_readlane_b32 s7, v39, 9 +; GFX906-NEXT: v_readlane_b32 s8, v39, 6 ; GFX906-NEXT: v_readlane_b32 s9, v39, 7 +; GFX906-NEXT: v_readlane_b32 s10, v39, 4 ; GFX906-NEXT: v_readlane_b32 s11, v39, 5 ; GFX906-NEXT: v_readlane_b32 s12, v39, 3 ; GFX906-NEXT: v_readlane_b32 s13, v39, 2 ; GFX906-NEXT: v_readlane_b32 s14, v39, 1 ; GFX906-NEXT: v_readlane_b32 s15, v39, 0 ; GFX906-NEXT: v_mov_b32_e32 v31, v40 +; GFX906-NEXT: v_readlane_b32 s16, v39, 22 ; GFX906-NEXT: 
v_readlane_b32 s17, v39, 23 ; GFX906-NEXT: v_readlane_b32 s21, v39, 12 ; GFX906-NEXT: ;;#ASMSTART @@ -528,13 +528,13 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX908-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX908-NEXT: s_mov_b64 exec, s[34:35] -; GFX908-NEXT: v_readlane_b32 s16, v39, 22 ; GFX908-NEXT: s_mov_b32 s12, s24 ; GFX908-NEXT: s_mov_b32 s13, s23 ; GFX908-NEXT: s_mov_b32 s14, s22 ; GFX908-NEXT: v_mov_b32_e32 v31, v32 ; GFX908-NEXT: s_mov_b32 s15, s21 ; GFX908-NEXT: s_mov_b64 s[10:11], s[26:27] +; GFX908-NEXT: v_readlane_b32 s16, v39, 22 ; GFX908-NEXT: v_readlane_b32 s17, v39, 23 ; GFX908-NEXT: v_mov_b32_e32 v40, v32 ; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17] @@ -621,20 +621,20 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX908-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX908-NEXT: s_mov_b64 exec, s[34:35] -; GFX908-NEXT: v_readlane_b32 s4, v39, 10 -; GFX908-NEXT: v_readlane_b32 s6, v39, 8 -; GFX908-NEXT: v_readlane_b32 s8, v39, 6 -; GFX908-NEXT: v_readlane_b32 s10, v39, 4 -; GFX908-NEXT: v_readlane_b32 s16, v39, 22 ; GFX908-NEXT: v_readlane_b32 s12, v39, 3 ; GFX908-NEXT: v_mov_b32_e32 v31, v40 ; GFX908-NEXT: v_readlane_b32 s13, v39, 2 ; GFX908-NEXT: v_readlane_b32 s14, v39, 1 ; GFX908-NEXT: v_readlane_b32 s15, v39, 0 +; GFX908-NEXT: v_readlane_b32 s4, v39, 10 ; GFX908-NEXT: v_readlane_b32 s5, v39, 11 +; GFX908-NEXT: v_readlane_b32 s6, v39, 8 ; GFX908-NEXT: v_readlane_b32 s7, v39, 9 +; GFX908-NEXT: v_readlane_b32 s8, v39, 6 ; GFX908-NEXT: v_readlane_b32 s9, v39, 7 +; GFX908-NEXT: v_readlane_b32 s10, v39, 4 ; GFX908-NEXT: v_readlane_b32 s11, v39, 5 +; GFX908-NEXT: v_readlane_b32 s16, v39, 22 ; GFX908-NEXT: v_readlane_b32 s17, v39, 23 ; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX908-NEXT: 
s_or_saveexec_b64 s[34:35], -1 @@ -642,19 +642,19 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX908-NEXT: s_mov_b64 exec, s[34:35] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_readlane_b32 s4, v39, 10 -; GFX908-NEXT: v_readlane_b32 s6, v39, 8 -; GFX908-NEXT: v_readlane_b32 s8, v39, 6 -; GFX908-NEXT: v_readlane_b32 s10, v39, 4 -; GFX908-NEXT: v_readlane_b32 s16, v39, 22 ; GFX908-NEXT: v_readlane_b32 s5, v39, 11 +; GFX908-NEXT: v_readlane_b32 s6, v39, 8 ; GFX908-NEXT: v_readlane_b32 s7, v39, 9 +; GFX908-NEXT: v_readlane_b32 s8, v39, 6 ; GFX908-NEXT: v_readlane_b32 s9, v39, 7 +; GFX908-NEXT: v_readlane_b32 s10, v39, 4 ; GFX908-NEXT: v_readlane_b32 s11, v39, 5 ; GFX908-NEXT: v_readlane_b32 s12, v39, 3 ; GFX908-NEXT: v_readlane_b32 s13, v39, 2 ; GFX908-NEXT: v_readlane_b32 s14, v39, 1 ; GFX908-NEXT: v_readlane_b32 s15, v39, 0 ; GFX908-NEXT: v_mov_b32_e32 v31, v40 +; GFX908-NEXT: v_readlane_b32 s16, v39, 22 ; GFX908-NEXT: v_readlane_b32 s17, v39, 23 ; GFX908-NEXT: v_readlane_b32 s21, v39, 12 ; GFX908-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll index 15f5f890d57b5..d1dee534414ac 100644 --- a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll @@ -8,6 +8,7 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-LABEL: kernel0: ; CHECK: ; %bb.0: +; CHECK-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART @@ -19,10 +20,9 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[2:3] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane -; CHECK-NEXT: s_load_dword s0, s[8:9], 0x8 ; CHECK-NEXT: v_writelane_b32 v22, s2, 0 ; CHECK-NEXT: 
v_writelane_b32 v22, s3, 1 +; CHECK-NEXT: s_load_dword s0, s[8:9], 0x8 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[48:51] ; CHECK-NEXT: ;;#ASMEND @@ -123,19 +123,19 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: v_writelane_b32 v22, s0, 58 ; CHECK-NEXT: v_writelane_b32 v22, s1, 59 ; CHECK-NEXT: v_writelane_b32 v22, s2, 60 +; CHECK-NEXT: ; implicit-def: $vgpr23 : SGPR spill to VGPR lane ; CHECK-NEXT: v_writelane_b32 v22, s3, 61 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ; implicit-def: $vgpr23 : SGPR spill to VGPR lane -; CHECK-NEXT: v_writelane_b32 v22, s0, 62 ; CHECK-NEXT: v_writelane_b32 v23, s2, 0 ; CHECK-NEXT: v_writelane_b32 v23, s3, 1 ; CHECK-NEXT: v_writelane_b32 v23, s4, 2 ; CHECK-NEXT: v_writelane_b32 v23, s5, 3 ; CHECK-NEXT: v_writelane_b32 v23, s6, 4 -; CHECK-NEXT: v_writelane_b32 v22, s1, 63 +; CHECK-NEXT: v_writelane_b32 v22, s0, 62 ; CHECK-NEXT: v_writelane_b32 v23, s7, 5 +; CHECK-NEXT: v_writelane_b32 v22, s1, 63 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:15] ; CHECK-NEXT: ;;#ASMEND @@ -208,6 +208,9 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_readlane_b32 s0, v22, 2 ; CHECK-NEXT: v_readlane_b32 s1, v22, 3 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[48:51] +; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_readlane_b32 s2, v22, 4 ; CHECK-NEXT: v_readlane_b32 s3, v22, 5 ; CHECK-NEXT: v_readlane_b32 s4, v22, 6 @@ -215,9 +218,6 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: v_readlane_b32 s6, v22, 8 ; CHECK-NEXT: v_readlane_b32 s7, v22, 9 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[48:51] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:7] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_readlane_b32 s0, v22, 10 @@ -241,29 +241,23 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: 
;;#ASMEND ; CHECK-NEXT: v_readlane_b32 s0, v22, 26 ; CHECK-NEXT: v_readlane_b32 s1, v22, 27 -; CHECK-NEXT: v_readlane_b32 s2, v22, 28 -; CHECK-NEXT: v_readlane_b32 s3, v22, 29 -; CHECK-NEXT: v_readlane_b32 s4, v22, 30 -; CHECK-NEXT: v_readlane_b32 s5, v22, 31 -; CHECK-NEXT: v_readlane_b32 s6, v22, 32 -; CHECK-NEXT: v_readlane_b32 s7, v22, 33 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[38:39] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[44:47] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s2, v22, 28 +; CHECK-NEXT: v_readlane_b32 s3, v22, 29 +; CHECK-NEXT: v_readlane_b32 s4, v22, 30 +; CHECK-NEXT: v_readlane_b32 s5, v22, 31 +; CHECK-NEXT: v_readlane_b32 s6, v22, 32 +; CHECK-NEXT: v_readlane_b32 s7, v22, 33 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:7] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_readlane_b32 s0, v22, 34 ; CHECK-NEXT: v_readlane_b32 s1, v22, 35 -; CHECK-NEXT: v_readlane_b32 s2, v22, 36 -; CHECK-NEXT: v_readlane_b32 s3, v22, 37 -; CHECK-NEXT: v_readlane_b32 s4, v22, 38 -; CHECK-NEXT: v_readlane_b32 s5, v22, 39 -; CHECK-NEXT: v_readlane_b32 s6, v22, 40 -; CHECK-NEXT: v_readlane_b32 s7, v22, 41 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[16:31] ; CHECK-NEXT: ;;#ASMEND @@ -273,6 +267,12 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[40:43] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s2, v22, 36 +; CHECK-NEXT: v_readlane_b32 s3, v22, 37 +; CHECK-NEXT: v_readlane_b32 s4, v22, 38 +; CHECK-NEXT: v_readlane_b32 s5, v22, 39 +; CHECK-NEXT: v_readlane_b32 s6, v22, 40 +; CHECK-NEXT: v_readlane_b32 s7, v22, 41 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:7] ; CHECK-NEXT: ;;#ASMEND @@ -297,11 +297,11 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_readlane_b32 s0, v22, 58 ; CHECK-NEXT: v_readlane_b32 s1, v22, 59 -; CHECK-NEXT: v_readlane_b32 s2, v22, 60 -; 
CHECK-NEXT: v_readlane_b32 s3, v22, 61 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[34:35] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s2, v22, 60 +; CHECK-NEXT: v_readlane_b32 s3, v22, 61 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:3] ; CHECK-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll index e7bc851817f3a..6b0ede1ac3ab8 100644 --- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll +++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll @@ -278,10 +278,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: .LBB1_25: ; %Flow23 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: s_load_dwordx4 s[4:7], s[38:39], 0x0 -; GLOBALNESS1-NEXT: v_readlane_b32 s70, v57, 8 -; GLOBALNESS1-NEXT: v_readlane_b32 s8, v57, 10 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0 +; GLOBALNESS1-NEXT: v_readlane_b32 s70, v57, 8 ; GLOBALNESS1-NEXT: v_readlane_b32 s71, v57, 9 +; GLOBALNESS1-NEXT: v_readlane_b32 s8, v57, 10 ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) ; GLOBALNESS1-NEXT: s_mov_b32 s55, s7 ; GLOBALNESS1-NEXT: v_readlane_b32 s9, v57, 11 @@ -587,11 +587,11 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_branch .LBB1_29 ; GLOBALNESS0-NEXT: .LBB1_25: ; %Flow23 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_readlane_b32 s84, v57, 8 -; GLOBALNESS0-NEXT: v_readlane_b32 s8, v57, 10 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0 ; GLOBALNESS0-NEXT: s_mov_b32 s55, s83 +; GLOBALNESS0-NEXT: v_readlane_b32 s84, v57, 8 ; GLOBALNESS0-NEXT: v_readlane_b32 s85, v57, 9 +; GLOBALNESS0-NEXT: v_readlane_b32 s8, v57, 10 ; GLOBALNESS0-NEXT: v_readlane_b32 s9, v57, 11 ; GLOBALNESS0-NEXT: .LBB1_26: ; %Flow24 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll 
b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll index e9a0671ead4e0..57ddcb20d613c 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -158,7 +158,6 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) ; GFX9-O0-NEXT: v_writelane_b32 v5, s34, 2 ; GFX9-O0-NEXT: v_writelane_b32 v5, s35, 3 ; GFX9-O0-NEXT: s_mov_b32 s34, 0 -; GFX9-O0-NEXT: s_nop 2 ; GFX9-O0-NEXT: buffer_load_dwordx2 v[3:4], off, s[36:39], s34 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll index 72672c8b6efad..6a3a58e3ab120 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll @@ -135,7 +135,6 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: v_writelane_b32 v5, s0, 3 ; GFX9-O0-NEXT: v_writelane_b32 v5, s1, 4 ; GFX9-O0-NEXT: s_mov_b32 s0, 0 -; GFX9-O0-NEXT: s_nop 2 ; GFX9-O0-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], s0 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill @@ -965,7 +964,6 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: v_writelane_b32 v5, s0, 3 ; GFX9-O0-NEXT: v_writelane_b32 v5, s1, 4 ; GFX9-O0-NEXT: s_mov_b32 s0, 0 -; GFX9-O0-NEXT: s_nop 2 ; GFX9-O0-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], s0 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill From c648db57b5e22d7aa8abe4d19779b68f8a502567 Mon Sep 17 00:00:00 2001 From: John Lu Date: Mon, 24 Nov 2025 21:21:20 -0600 Subject: [PATCH 2/5] Update kill/undef flags to satisfy MachineVerifier Signed-off-by: John Lu --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 10 ++++++++++ 
.../CodeGen/AMDGPU/fold-reload-into-exec.mir | 16 ++++++++-------- llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir | 4 ++-- .../test/CodeGen/AMDGPU/insert-waitcnts-crash.ll | 6 +++--- 4 files changed, 23 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 66e1873319553..27c39933dc682 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2095,6 +2095,16 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case AMDGPU::SI_SPILL_S32_TO_VGPR: mutateAndCleanupImplicit(MI, get(AMDGPU::V_WRITELANE_B32)); + // When leftover implicit-def operands are removed, kill flag is no longer + // valid. Thus: + // $X = SI_SPILL_S32_TO_VGPR killed $sgpr0, 0, $X(tied-def 0), + // implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1 + // must be converted to: + // $X = V_WRITELANE_B32 $sgpr0, 0, $X(tied-def 0) + MI.getOperand(1).setIsKill(false); + // Sometimes a SGPR that has already been killed is spilled. + // Add undef to appease the MachineVerifier. 
+ MI.getOperand(1).setIsUndef(true); break; case AMDGPU::SI_RESTORE_S32_FROM_VGPR: diff --git a/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir b/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir index 744871d8c84ff..4ee2e101521e9 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir @@ -15,7 +15,7 @@ body: | ; CHECK: S_NOP 0, implicit-def $exec_lo ; CHECK-NEXT: $sgpr0 = S_MOV_B32 $exec_lo ; CHECK-NEXT: $vgpr0 = IMPLICIT_DEF - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 undef $sgpr0, 0, $vgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: S_NOP 0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0, implicit killed renamable $sgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 @@ -40,7 +40,7 @@ body: | ; CHECK: S_NOP 0, implicit-def $exec_hi ; CHECK-NEXT: $sgpr0 = S_MOV_B32 $exec_hi ; CHECK-NEXT: $vgpr0 = IMPLICIT_DEF - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 undef $sgpr0, 0, $vgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: S_NOP 0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0, implicit killed renamable $sgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 @@ -65,8 +65,8 @@ body: | ; CHECK: S_NOP 0, implicit-def $exec ; CHECK-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec ; CHECK-NEXT: $vgpr0 = IMPLICIT_DEF - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, $vgpr0 + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 undef $sgpr0, 0, $vgpr0 + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 undef $sgpr1, 1, $vgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1 ; CHECK-NEXT: S_NOP 0, implicit-def dead renamable $sgpr2_sgpr3, implicit-def dead renamable $sgpr0_sgpr1, implicit killed renamable 
$sgpr0_sgpr1 @@ -95,7 +95,7 @@ body: | ; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_exec_lo ; CHECK: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $exec_lo ; CHECK-NEXT: $vgpr0 = IMPLICIT_DEF - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 undef $sgpr0, 0, $vgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 @@ -118,7 +118,7 @@ body: | ; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_exec_hi ; CHECK: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $exec_hi ; CHECK-NEXT: $vgpr0 = IMPLICIT_DEF - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 undef $sgpr0, 0, $vgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 @@ -141,8 +141,8 @@ body: | ; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_exec ; CHECK: S_NOP 0, implicit-def renamable $sgpr0_sgpr1, implicit-def dead renamable $sgpr2_sgpr3, implicit-def $exec ; CHECK-NEXT: $vgpr0 = IMPLICIT_DEF - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, $vgpr0 + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 undef $sgpr0, 0, $vgpr0 + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 undef $sgpr1, 1, $vgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1 ; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr0_sgpr1, implicit-def dead renamable $sgpr2_sgpr3, implicit-def dead renamable $sgpr0_sgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir 
b/llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir index 1c2436bd6b6cd..1712e32c42a14 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir @@ -16,7 +16,7 @@ body: | ; CHECK: S_NOP 0, implicit-def $m0 ; CHECK-NEXT: $sgpr0 = S_MOV_B32 $m0 ; CHECK-NEXT: $vgpr0 = IMPLICIT_DEF - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 undef $sgpr0, 0, $vgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: S_NOP 0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0, implicit killed renamable $sgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 @@ -45,7 +45,7 @@ body: | ; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_m0 ; CHECK: $vgpr0 = IMPLICIT_DEF ; CHECK-NEXT: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $m0 - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 undef $sgpr0, 0, $vgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll index 58cd2f5bc11af..e17b663ac5923 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll @@ -16,7 +16,7 @@ define fastcc i32 @foo() { ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) ; CHECK-NEXT: $exec_lo = S_MOV_B32 killed $sgpr17 ; CHECK-NEXT: $sgpr32 = frame-setup S_ADDK_I32 $sgpr32, 512, implicit-def dead $scc - ; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr16, 2, undef $vgpr40 + ; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 undef $sgpr16, 
2, undef $vgpr40 ; CHECK-NEXT: BUNDLE implicit-def $sgpr16_sgpr17, implicit-def $sgpr16, implicit-def $scc, implicit-def $sgpr17 { ; CHECK-NEXT: $sgpr16_sgpr17 = S_GETPC_B64 ; CHECK-NEXT: $sgpr16 = S_ADD_U32 internal $sgpr16, target-flags(amdgpu-gotprel32-lo) @bar + 4, implicit-def $scc @@ -26,8 +26,8 @@ define fastcc i32 @foo() { ; CHECK-NEXT: BUFFER_GL1_INV implicit $exec ; CHECK-NEXT: BUFFER_GL0_INV implicit $exec ; CHECK-NEXT: renamable $sgpr16_sgpr17 = S_LOAD_DWORDX2_IMM killed renamable $sgpr16_sgpr17, 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) - ; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr30, 0, $vgpr40 - ; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr31, 1, $vgpr40 + ; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 undef $sgpr30, 0, $vgpr40 + ; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 undef $sgpr31, 1, $vgpr40 ; CHECK-NEXT: S_WAITCNT 49279 ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr16_sgpr17, @bar, csr_amdgpu, implicit killed $sgpr4_sgpr5, implicit killed $sgpr6_sgpr7, implicit killed $sgpr8_sgpr9, implicit killed $sgpr10_sgpr11, implicit killed $sgpr12, implicit killed $sgpr13, implicit killed $sgpr14, implicit killed $sgpr15, implicit killed $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: $vcc_lo = S_MOV_B32 $exec_lo From 4e86066af793a9c89e8c11b06aadccbe90c8340f Mon Sep 17 00:00:00 2001 From: John Lu Date: Tue, 25 Nov 2025 13:09:59 -0600 Subject: [PATCH 3/5] Add testcase that spills a killed register Signed-off-by: John Lu --- llvm/test/CodeGen/AMDGPU/spilled_kill.mir | 168 ++++++++++++++++++++++ 1 file changed, 168 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/spilled_kill.mir diff --git a/llvm/test/CodeGen/AMDGPU/spilled_kill.mir b/llvm/test/CodeGen/AMDGPU/spilled_kill.mir new file mode 100644 index 0000000000000..c5389ec448539 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/spilled_kill.mir @@ -0,0 +1,168 @@ +# RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs 
-start-before=prologepilog -stop-after=postrapseudos -o - %s | FileCheck -check-prefix=GCN %s + +# This testcase shows the necessity of adding an undef flag to the $sgpr source when +# creating a V_WRITELANE_B32. +# +# The prologepilog pass will create a: +# +# $exec = S_MOV_B64 killed $sgpr4_sgpr5 +# +# but $sgpr5 will be spilled by a: +# +# $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr5, 1, killed $vgpr1(tied-def 0), implicit $sgpr4_sgpr5 +# +# The SI_SPILL_S32_TO_VGPR will be converted in postrapesudos to: +# +# $vgpr1 = V_WRITELANE_B32 undef $sgpr5, 1, killed $vgpr1(tied-def 0) +# +# The undef flag is necessary to satisfy the MachineVerifier since $sgpr5 is used after a kill. +# This testcase was derived from function bitcast_v32i16_to_v64i8_scalar in amdgcn.bitcast.512bit.ll. + +# GCN-LABEL: name: bitcast_v32i16_to_v64i8_scalar +--- +name: bitcast_v32i16_to_v64i8_scalar +alignment: 1 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +noPhis: true +isSSA: false +noVRegs: true +hasFakeUses: false +callsEHReturn: false +callsUnwindInit: false +hasEHContTarget: false +hasEHScopes: false +hasEHFunclets: false +isOutlined: false +debugInstrRef: false +failsVerification: false +tracksDebugUserValues: true +registers: [] +liveins: [] +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 4 + adjustsStack: false + hasCalls: false + stackProtector: '' + functionContext: '' + maxCallFrameSize: 4294967295 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + hasTailCall: false + isCalleeSavedInfoValid: false + localFrameSize: 0 +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + 
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 2, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +entry_values: [] +callSites: [] +debugValueSubstitutions: [] +constants: [] +machineFunctionInfo: + explicitKernArgSize: 0 + maxKernArgAlign: 1 + ldsSize: 0 + gdsSize: 0 + dynLDSAlign: 1 + isEntryFunction: false + isChainFunction: false + noSignedZerosFPMath: false + memoryBound: false + waveLimiter: false + hasSpilledSGPRs: true + hasSpilledVGPRs: true + numWaveDispatchSGPRs: 30 + numWaveDispatchVGPRs: 20 + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + bytesInStackArgArea: 0 + returnsVoid: true + argumentInfo: + privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } + dispatchPtr: { reg: '$sgpr4_sgpr5' } + queuePtr: { reg: '$sgpr6_sgpr7' } + dispatchID: { reg: '$sgpr10_sgpr11' } + workGroupIDX: { reg: '$sgpr12' } + workGroupIDY: { reg: '$sgpr13' } + workGroupIDZ: { reg: '$sgpr14' } + LDSKernelId: { reg: '$sgpr15' } + implicitArgPtr: { reg: '$sgpr8_sgpr9' } + workItemIDX: { reg: '$vgpr31', mask: 1023 } + workItemIDY: { reg: '$vgpr31', mask: 1047552 } + workItemIDZ: { reg: '$vgpr31', mask: 1072693248 } + psInputAddr: 0 + psInputEnable: 0 + maxMemoryClusterDWords: 8 + mode: + ieee: true + dx10-clamp: true + fp32-input-denormals: true + fp32-output-denormals: true + fp64-fp16-input-denormals: true + fp64-fp16-output-denormals: true + highBitsOf32BitAddress: 0 + occupancy: 3 + spillPhysVGPRs: + - '$vgpr63' + wwmReservedRegs: + - '$vgpr63' + - '$vgpr62' + vgprForAGPRCopy: '' + 
sgprForEXECCopy: '$sgpr100_sgpr101' + longBranchReservedReg: '' + hasInitWholeWave: false + dynamicVGPRBlockSize: 0 + scratchReservedForDynamicVGPRs: 0 + numKernargPreloadSGPRs: 0 + isWholeWaveFunction: false +body: | + bb.0: + successors: %bb.3(0x40000000), %bb.1(0x40000000) + + renamable $sgpr4 = IMPLICIT_DEF + $vgpr62 = SI_SPILL_S32_TO_VGPR $sgpr5, 1, killed $vgpr62, implicit $sgpr4_sgpr5 + S_CBRANCH_EXECZ %bb.1, implicit $exec + S_BRANCH %bb.3 + + bb.1: + successors: %bb.2(0x40000000), %bb.4(0x40000000) + + S_CBRANCH_EXECZ %bb.4, implicit $exec + S_BRANCH %bb.2 + + bb.2: + successors: %bb.4(0x80000000) + + S_BRANCH %bb.4 + + bb.3: + successors: %bb.1(0x80000000) + + S_BRANCH %bb.1 + + bb.4: + + SI_RETURN +... From 91a43676bfc1e96e7a6a6a8197f34a73321b3f34 Mon Sep 17 00:00:00 2001 From: John Lu Date: Tue, 25 Nov 2025 16:50:02 -0600 Subject: [PATCH 4/5] Minimize testcase Signed-off-by: John Lu --- llvm/test/CodeGen/AMDGPU/spilled_kill.mir | 112 +--------------------- 1 file changed, 1 insertion(+), 111 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/spilled_kill.mir b/llvm/test/CodeGen/AMDGPU/spilled_kill.mir index c5389ec448539..cc9cba06a54e6 100644 --- a/llvm/test/CodeGen/AMDGPU/spilled_kill.mir +++ b/llvm/test/CodeGen/AMDGPU/spilled_kill.mir @@ -11,7 +11,7 @@ # # $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr5, 1, killed $vgpr1(tied-def 0), implicit $sgpr4_sgpr5 # -# The SI_SPILL_S32_TO_VGPR will be converted in postrapesudos to: +# The SI_SPILL_S32_TO_VGPR will be converted in postrapseudos to: # # $vgpr1 = V_WRITELANE_B32 undef $sgpr5, 1, killed $vgpr1(tied-def 0) # @@ -21,122 +21,12 @@ # GCN-LABEL: name: bitcast_v32i16_to_v64i8_scalar --- name: bitcast_v32i16_to_v64i8_scalar -alignment: 1 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -failedISel: false tracksRegLiveness: true -hasWinCFI: false -noPhis: true -isSSA: false -noVRegs: true -hasFakeUses: false -callsEHReturn: false -callsUnwindInit: false -hasEHContTarget: false 
-hasEHScopes: false -hasEHFunclets: false -isOutlined: false -debugInstrRef: false -failsVerification: false -tracksDebugUserValues: true -registers: [] -liveins: [] -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 4 - adjustsStack: false - hasCalls: false - stackProtector: '' - functionContext: '' - maxCallFrameSize: 4294967295 - cvBytesOfCalleeSavedRegisters: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false - hasTailCall: false - isCalleeSavedInfoValid: false - localFrameSize: 0 -fixedStack: [] -stack: - - { id: 0, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 1, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 2, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } -entry_values: [] -callSites: [] -debugValueSubstitutions: [] -constants: [] machineFunctionInfo: - explicitKernArgSize: 0 - maxKernArgAlign: 1 - ldsSize: 0 - gdsSize: 0 - dynLDSAlign: 1 - isEntryFunction: false - isChainFunction: false - noSignedZerosFPMath: false - memoryBound: false - waveLimiter: false - hasSpilledSGPRs: true - hasSpilledVGPRs: true - numWaveDispatchSGPRs: 30 - numWaveDispatchVGPRs: 20 scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' - frameOffsetReg: '$sgpr33' stackPtrOffsetReg: '$sgpr32' - bytesInStackArgArea: 0 - returnsVoid: true - argumentInfo: - privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } - 
dispatchPtr: { reg: '$sgpr4_sgpr5' } - queuePtr: { reg: '$sgpr6_sgpr7' } - dispatchID: { reg: '$sgpr10_sgpr11' } - workGroupIDX: { reg: '$sgpr12' } - workGroupIDY: { reg: '$sgpr13' } - workGroupIDZ: { reg: '$sgpr14' } - LDSKernelId: { reg: '$sgpr15' } - implicitArgPtr: { reg: '$sgpr8_sgpr9' } - workItemIDX: { reg: '$vgpr31', mask: 1023 } - workItemIDY: { reg: '$vgpr31', mask: 1047552 } - workItemIDZ: { reg: '$vgpr31', mask: 1072693248 } - psInputAddr: 0 - psInputEnable: 0 - maxMemoryClusterDWords: 8 - mode: - ieee: true - dx10-clamp: true - fp32-input-denormals: true - fp32-output-denormals: true - fp64-fp16-input-denormals: true - fp64-fp16-output-denormals: true - highBitsOf32BitAddress: 0 - occupancy: 3 - spillPhysVGPRs: - - '$vgpr63' wwmReservedRegs: - - '$vgpr63' - '$vgpr62' - vgprForAGPRCopy: '' - sgprForEXECCopy: '$sgpr100_sgpr101' - longBranchReservedReg: '' - hasInitWholeWave: false - dynamicVGPRBlockSize: 0 - scratchReservedForDynamicVGPRs: 0 - numKernargPreloadSGPRs: 0 - isWholeWaveFunction: false body: | bb.0: successors: %bb.3(0x40000000), %bb.1(0x40000000) From 682267dda243292f7e7aa0ee1da694d2094f184d Mon Sep 17 00:00:00 2001 From: John Lu Date: Tue, 25 Nov 2025 18:19:36 -0600 Subject: [PATCH 5/5] Test necessity of removing kill flag. 
Signed-off-by: John Lu --- llvm/test/CodeGen/AMDGPU/invalid-kill.mir | 57 +++++++++++++++++++ .../{spilled_kill.mir => spilled-kill.mir} | 0 2 files changed, 57 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/invalid-kill.mir rename llvm/test/CodeGen/AMDGPU/{spilled_kill.mir => spilled-kill.mir} (100%) diff --git a/llvm/test/CodeGen/AMDGPU/invalid-kill.mir b/llvm/test/CodeGen/AMDGPU/invalid-kill.mir new file mode 100644 index 0000000000000..5d32e8cd89234 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/invalid-kill.mir @@ -0,0 +1,57 @@ +# RUN: llc -mtriple=amdgcn -verify-machineinstrs=1 -start-before=postrapseudos -stop-after=postrapseudos -O0 -o - %s | FileCheck -check-prefix=GCN %s + +# This testcase shows the necessity of removing the killed flag from the $sgpr source when +# creating a V_WRITELANE_B32. The SI_SPILL_S32_TO_VGPR will be converted in postrapseudos from: +# +# $X = SI_SPILL_S32_TO_VGPR killed $sgpr0, 6, $X, implicit-def $sgpr0_sgpr1 +# +# to: +# +# $X = V_WRITELANE_B32 $sgpr0, 6, $X(tied-def 0) +# +# The killed flag must be removed since $sgpr0_sgpr1 is no longer implicitly defined by the +# instruction, so $sgpr0 stays live and is subsequently read by the S_MOV_B64_term. 
+ +# GCN-LABEL: name: non_uniform_loop +--- +name: non_uniform_loop +tracksRegLiveness: true +machineFunctionInfo: + wwmReservedRegs: + - '$vgpr63' +body: | + bb.0: + successors: %bb.1(0x80000000) + liveins: $vgpr0 + + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2(0x40000000), %bb.3(0x40000000) + + $sgpr0 = SI_RESTORE_S32_FROM_VGPR $vgpr63, 2, implicit-def $sgpr0_sgpr1 + $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr0, 6, $vgpr63, implicit-def $sgpr0_sgpr1 + $exec = S_MOV_B64_term killed renamable $sgpr0_sgpr1 + S_CBRANCH_EXECZ %bb.3, implicit $exec + S_BRANCH %bb.2 + + bb.2: + successors: %bb.3(0x80000000) + + S_BRANCH %bb.3 + + bb.3: + successors: %bb.4(0x40000000), %bb.1(0x40000000) + + $exec = S_ANDN2_B64_term $exec, killed undef renamable $sgpr0_sgpr1, implicit-def dead $scc + S_CBRANCH_EXECNZ %bb.1, implicit $exec + S_BRANCH %bb.4 + + bb.4: + successors: %bb.5(0x80000000) + + $exec = S_OR_B64_term $exec, killed undef renamable $sgpr0_sgpr1, implicit-def dead $scc + + bb.5: + S_ENDPGM 0 +... diff --git a/llvm/test/CodeGen/AMDGPU/spilled_kill.mir b/llvm/test/CodeGen/AMDGPU/spilled-kill.mir similarity index 100% rename from llvm/test/CodeGen/AMDGPU/spilled_kill.mir rename to llvm/test/CodeGen/AMDGPU/spilled-kill.mir